# R Options
options(stringsAsFactors=FALSE,
citation_format="pandoc",
dplyr.summarise.inform=FALSE,
knitr.table.format="html",
kableExtra_view_html=TRUE,
future.globals.maxSize=+Inf,
mc.cores=params$cores,
future.fork.enable=TRUE, future.plan="multicore",
future.rng.onMisuse="ignore")
# Python3 needed for clustering, umap, other python packages
# Path to binary will be automatically found
# Set manually if it does not work
reticulate_python3_path = unname(Sys.which("python3"))
Sys.setenv(RETICULATE_PYTHON=reticulate_python3_path)
# Required libraries
library(Seurat) # main
library(ggplot2) # plots
library(patchwork) # combination of plots
library(magrittr) # %>% operator
library(reticulate) # required for "leiden" clustering
library(enrichR) # functional enrichment
library(future) # multicore support for Seurat
# Other libraries we use
# Knit: knitr
# Data handling: dplyr, tidyr, purrr, stringr, Matrix, sctransform, glmGamPoi (optional for speed but only available for R 4.0)
# Tables: kableExtra, DT
# Plots: ggsci
# IO: openxlsx, readr, R.utils
# Annotation: biomaRt
# DEG: mast, limma (for a more efficient implementation of the Wilcoxon Rank Sum Test according to Seurat)
# Functional enrichment: enrichR
# Other: sessioninfo, cerebroApp, sceasy, ROpenSci/bibtex, knitcitations
# Knitr default options
knitr::opts_chunk$set(echo=TRUE, # output code
cache=FALSE, # do not cache results
message=TRUE, # show messages
warning=TRUE, # show warnings
tidy=FALSE, # do not auto-tidy-up code
fig.width=10, # default fig width in inches
class.source="fold-hide", # by default collapse code blocks
dev=c("png", "pdf"), # create figures in png and pdf; the first device (png) will be used for HTML output
dev.args=list(png=list(type="cairo"), # png: use cairo - works on cluster, supports anti-aliasing (more smooth)
pdf=list(bg="white")), # pdf: use cairo - works on cluster, supports anti-aliasing (more smooth)
dpi=96, # figure resolution
fig.retina=2 # retina multiplier
)
# If the DOI publication servers cannot be reached, there will be no citations, knitcitations will not write a references.bib file and pandoc will stop. This makes sure that there is always at least one citation.
invisible(knitcitations::citep(citation("knitr")))cat(paste0('<link rel="stylesheet" href="', params$path_to_git, '/css/style.css">'))# Copy input parameters to internal parameter list
param = params
if (is.null(param$samples_to_drop )) param$samples_to_drop = c()
if (is.null(param$vars_to_regress)) param$vars_to_regress = c()
if (is.null(param$latent_vars)) param$latent_vars = c()
# Git directory and files to source must be done first, then all helper functions can be sourced
git_files_to_source = c("functions_io.R",
"functions_plotting.R",
"functions_analysis.R",
"functions_degs.R",
"functions_util.R")
git_files_to_source = file.path(param$path_to_git, "R", git_files_to_source)
file_exists = purrr::map_lgl(git_files_to_source, file.exists)
if (any(!file_exists)) stop(paste("The following files could not be found:", paste(git_files_to_source[!file_exists], collapse=", "), ". Please check the git directory at '", param$path_to_git, "'.!"))
invisible(purrr::map(git_files_to_source, source))
# Debugging mode:
switch (param$debugging_mode,
default_debugging=on_error_default_debugging(),
terminal_debugger=on_error_start_terminal_debugger(),
print_traceback=on_error_just_print_traceback(),
on_error_default_debugging())
# Set output hooks
knitr::knit_hooks$set(message=format_message, warning=format_warning)
# Create output directories
if (!file.exists(param$path_out)) dir.create(param$path_out, recursive=TRUE, showWarnings=FALSE)
dir.create(file.path(param$path_out, "figures"), recursive=TRUE, showWarnings=FALSE)
dir.create(file.path(param$path_out, "marker_degs"), recursive=TRUE, showWarnings=FALSE)
dir.create(file.path(param$path_out, "annotation"), recursive=TRUE, showWarnings=FALSE)
dir.create(file.path(param$path_out, "data"), recursive=TRUE, showWarnings=FALSE)
dir.create(file.path(param$path_out, "export"), recursive=TRUE, showWarnings=FALSE)
# Path for figures in png and pdf format (trailing "/" is needed)
knitr::opts_chunk$set(fig.path=paste0(file.path(param$path_out, "figures"), "/"))
# Path for annotation
param$file_annot = file.path(param$path_out, "annotation", paste0(param$mart_dataset, ".v", param$annot_version, ".annot.txt"))
# Path for cell cycle genes file
param$file_cc_genes = file.path(param$path_out, "annotation", "cell_cycle_markers.xlsx")
# Do checks
error_messages = c()
# Check parameters and parse entries (e.g. numbers) so that they are valid
param = check_parameters_scrnaseq(param)
error_messages = c(error_messages, param[["error_messages"]])
# Check installed packages
error_messages = c(error_messages, check_installed_packages_scrnaseq())
# Check python
error_messages = c(error_messages, check_python())
# Check pandoc
error_messages = c(error_messages, check_pandoc())
# Check enrichR
error_messages = c(error_messages, check_enrichr(param$enrichr_dbs, param$enrichr_site))
# Check ensembl
error_messages = c(error_messages, check_ensembl(biomart="ensembl",
dataset=param$mart_dataset,
mirror=param$biomart_mirror,
version=param$annot_version,
attributes=param$mart_attributes,
file_annot=param$file_annot,
file_cc_markers=param$file_cc_genes))
# Stop here if there are errors
if (length(error_messages)) stop(paste(c("", paste("*", error_messages)), collapse="\n"))The workflow can be run for pre-processed 10x data, as well as for pre-processed SmartSeq-2 or other data that are represented by a simple table with transcript counts per gene and cell.
Prior to this workflow, raw sequencing reads were mapped to the reference transcriptome. The resulting mapping statistics are printed below for a first estimation of sample quality.
# Are metrics provided?
if (!all(is.na(param$path_data$stats))) {
# Loop through all samples and read metrics
mapping_stats_list = list()
for (i in 1:nrow(param$path_data)) {
if (!is.na(param$path_data$stats[i])) {
mapping_stats_list[[param$path_data$name[i]]] = Read10XMetrics(param$path_data$stats[i])
}
}
# Join all mapping stats tables
mapping_stats = mapping_stats_list %>% purrr::reduce(dplyr::full_join, by="metric")
rownames(mapping_stats) = mapping_stats[["metric"]]
mapping_stats = mapping_stats %>% dplyr::select(-metric)
colnames(mapping_stats) = names(mapping_stats_list)
# Print table to HTML
knitr::kable(mapping_stats, align="l", caption="General metrics") %>%
kableExtra::kable_styling(bootstrap_options=c("striped", "hover")) %>%
kableExtra::scroll_box(width="100%", fixed_thead=TRUE)
} else {
message("Mapping statistics cannot be shown. No valid file provided.")
}× (Message)
Mapping statistics cannot be shown. No valid file provided.
We read gene annotation from Ensembl and write the resulting table to file. We generate several dictionaries to translate between Ensembl IDs, gene symbols, Entrez Ids, and Seurat gene names.
# Read annotation from csv or from Ensembl and a tab separated txt will be created
if (file.exists(param$file_annot)) {
annot_ensembl = read.delim(param$file_annot)
} else {
annot_mart = suppressWarnings(GetBiomaRt(biomart="ensembl",
dataset=param$mart_dataset,
mirror=param$biomart_mirror,
version=param$annot_version))
annot_ensembl = biomaRt::getBM(mart=annot_mart, attributes=param$mart_attributes, useCache=FALSE)
write.table(annot_ensembl, file=param$file_annot, sep="\t", col.names=TRUE, row.names=FALSE, append=FALSE)
message("Gene annotation file was created at: ", param$file_annot)
# Note: depending on the attributes, there might be more than one row per gene
}
# Double-check if we got all required annotation, in case annotation file was read
check_annot_main = all(param$annot_main %in% colnames(annot_ensembl))
if (!check_annot_main) {
stop("The annotation table misses at least one of the following columns: ", paste(param$annot_main, collapse=", "))
}
# Create translation tables
ensembl = param$annot_main["ensembl"]
symbol = param$annot_main["symbol"]
entrez = param$annot_main["entrez"]
# Ensembl id to gene symbol (new)
ensembl_to_symbol = unique(annot_ensembl[, c(ensembl, symbol)])
ensembl_to_symbol[, symbol] = ifelse(nchar(ensembl_to_symbol[, symbol]) == 0, NA, ensembl_to_symbol[, symbol])
ensembl_to_symbol = setNames(ensembl_to_symbol[, symbol], ensembl_to_symbol[, ensembl])
# Ensembl id to seurat-compatible unique rowname (new)
ensembl_to_seurat_rowname = unique(annot_ensembl[, c(ensembl, symbol)])
ensembl_to_seurat_rowname[, symbol] = ifelse(nchar(ensembl_to_seurat_rowname[, symbol]) == 0, ensembl_to_seurat_rowname[, ensembl], ensembl_to_seurat_rowname[, symbol])
ensembl_to_seurat_rowname[, symbol] = make.unique(gsub(pattern="_", replacement="-", x=ensembl_to_seurat_rowname[, symbol], fixed=TRUE))
ensembl_to_seurat_rowname = setNames(ensembl_to_seurat_rowname[, symbol], ensembl_to_seurat_rowname[, ensembl])
# Seurat-compatible unique rowname to ensembl id
seurat_rowname_to_ensembl = setNames(names(ensembl_to_seurat_rowname), ensembl_to_seurat_rowname)
# Ensembl to Entrez
ensembl_to_entrez = unique(annot_ensembl[, c(ensembl, entrez)])
ensembl_to_entrez[, entrez] = ifelse(nchar(ensembl_to_entrez[, entrez]) == 0, NA, ensembl_to_entrez[, entrez])
ensembl_to_entrez = split(ensembl_to_entrez[, entrez], ensembl_to_entrez[, ensembl])
# Seurat-compatible unique rowname to Entrez
seurat_rowname_to_ensembl_match = match(seurat_rowname_to_ensembl, names(ensembl_to_entrez))
names(seurat_rowname_to_ensembl_match) = names(seurat_rowname_to_ensembl)
seurat_rowname_to_entrez = purrr::map(seurat_rowname_to_ensembl_match, function(i) {unname(ensembl_to_entrez[[i]])})
# Entrez IDs is duplicating Ensembl IDs in annot_ensembl
# Therefore, we remove Entrez IDs from the annotation table, after generating all required translation tables
# Set rownames of annotation table to Ensembl identifiers
annot_ensembl = annot_ensembl[, -match(entrez, colnames(annot_ensembl))] %>% unique() %>% as.data.frame()
rownames(annot_ensembl) = annot_ensembl[, ensembl]# Use biomart to translate human cell cycle genes to the species of interest and save them in a file
if (file.exists(param$file_cc_genes)) {
# Load from file
genes_s = openxlsx::read.xlsx(param$file_cc_genes, sheet=1)
genes_g2m = openxlsx::read.xlsx(param$file_cc_genes, sheet=2)
} else {
# Obtain from Ensembl
# Note: both mart objects must point to the same mirror for biomarT::getLDS to work
mart_human = suppressWarnings(GetBiomaRt(biomart="ensembl",
dataset="hsapiens_gene_ensembl",
mirror=param$biomart_mirror,
version=param$annot_version))
mart_myspecies = suppressWarnings(GetBiomaRt(biomart="ensembl",
dataset=param$mart_dataset,
mirror=GetBiomaRtMirror(mart_human),
version=param$annot_version))
# S phase marker
genes_s = biomaRt::getLDS(attributes=c("ensembl_gene_id", "external_gene_name"),
filters="external_gene_name",
values=Seurat::cc.genes.updated.2019$s.genes,
mart=mart_human,
attributesL=c("ensembl_gene_id", "external_gene_name"),
martL=mart_myspecies,
uniqueRows=TRUE)
colnames(genes_s) = c("Human_ensembl_id", "Human_gene_name", "Species_ensembl_id", "Species_gene_name")
genes_s = genes_s %>% dplyr::arrange(Human_gene_name)
# G2/M marker
genes_g2m = biomaRt::getLDS(attributes=c("ensembl_gene_id", "external_gene_name"),
filters="external_gene_name",
values=Seurat::cc.genes.updated.2019$g2m.genes,
mart=mart_human,
attributesL=c("ensembl_gene_id", "external_gene_name"),
martL=mart_myspecies,
uniqueRows=TRUE)
colnames(genes_g2m) = c("Human_ensembl_id", "Human_gene_name", "Species_ensembl_id", "Species_gene_name")
genes_g2m = genes_g2m %>% dplyr::arrange(Human_gene_name)
# Write to file
openxlsx::write.xlsx(list(S_phase=genes_s,G2M_phase=genes_g2m), file=param$file_cc_genes)
}
# Convert Ensembl ID to Seurat-compatible unique rowname
genes_s = data.frame(Human_gene_name=genes_s$Human_gene_name, Species_gene_name=unname(ensembl_to_seurat_rowname[genes_s$Species_ensembl_id]))
genes_g2m = data.frame(Human_gene_name=genes_g2m$Human_gene_name, Species_gene_name=unname(ensembl_to_seurat_rowname[genes_g2m$Species_ensembl_id]))Seurat is an R-package that is used for the analysis of single-cell RNA-seq data. We read pre-processed data as described above, and convert it to a count matrix in R. In the case of 10x data, the count matrix contains the number of unique RNA molecules (UMIs) per gene and cell. In the case of SmartSeq-2 data, the count matrix contains the number of reads per gene and cell.
In addition to the count matrix, the workflow stores additional information in the Seurat object, including but not limited to the normalized data, dimensionality reduction and cluster results.# List of Seurat objects
sc = list()
datasets = param$path_data
for (i in seq(nrow(datasets))) {
name = datasets[i, "name"]
type = datasets[i, "type"]
path = datasets[i, "path"]
suffix = datasets[i, "suffix"]
# Read 10X or smartseq2
if (type == "10x") {
# Read 10X sparse matrix into a Seurat object
sc = c(sc, ReadSparseMatrix(path, project=name, row_name_column=1, convert_row_names=ensembl_to_seurat_rowname, cellnames_suffix=suffix))
} else if (type == "smartseq2") {
# Read counts table into a Seurat object
sc = c(sc, ReadCountsTable(path, project=name, row_name_column=1, convert_row_names=ensembl_to_seurat_rowname, parse_plate_information=TRUE, return_samples_as_datasets=TRUE, cellnames_suffix=suffix))
}
}
# Make sure that sample names are unique. If not, just prefix with the dataset name. Also set orig.ident to this name.
sample_names = names(sc)
duplicated_sample_names_idx = which(sample_names %in% sample_names[duplicated(sample_names)])
for (i in duplicated_sample_names_idx) {
sample_names[i] = paste(head(sc[[i]][["orig.dataset", drop=TRUE]], 1), sample_names[i], sep=".")
sc[[i]][["orig.ident"]] = sample_names[i]
}
# Set up colors for samples and add them to the sc objects
sample_names = purrr::flatten_chr(purrr::map(sc, function(s) {
nms = unique(as.character(s[[]][["orig.ident"]]))
return(nms)
}))
param$col_samples = GenerateColours(num_colours=length(sample_names), names=sample_names, palette=param$col_palette_samples, alphas=1)
sc = purrr::map(sc, ScAddLists, lists=list(orig.ident=param$col_samples), lists_slot="colour_lists")
# Downsample cells if requested
if (!is.null(param$downsample_cells_n)) {
sc = purrr::map(sc, function(s) {
cells = ScSampleCells(sc=s, n=param$downsample_cells_n, seed=1)
return(subset(s, cells=cells))
})
}
sc## $pbmc_10x
## An object of class Seurat
## 33694 features across 100 samples within 1 assay
## Active assay: RNA (33694 features, 0 variable features)
##
## $pbmc_smartseq2_sample1
## An object of class Seurat
## 33694 features across 100 samples within 1 assay
## Active assay: RNA (33694 features, 0 variable features)
The following first table shows available metadata (columns) of the first 5 cells (rows). These metadata provide additional information about the cells in the dataset, such as the sample a cell belongs to (“orig.ident”), the number of mapped reads (“nCounts_RNA”), or the above mentioned number of unique genes detected (“nFeature_RNA”). The second table shows available metadata (columns) of the first 5 genes (rows).
# Combine cell metadata of the Seurat objects into one big metadata
sc_cell_metadata = suppressWarnings(purrr::map_dfr(sc, function(s) return(s[[]])) %>% as.data.frame())
# Print cell metadata
knitr::kable(head(sc_cell_metadata), align="l", caption="Cell metadata, top 5 rows") %>%
kableExtra::kable_styling(bootstrap_options=c("striped", "hover")) %>%
kableExtra::scroll_box(width="100%")| orig.ident | nCount_RNA | nFeature_RNA | orig.dataset | SampleName | PlateNumber | PlateRow | PlateCol | |
|---|---|---|---|---|---|---|---|---|
| PBMC1_10x_CACAGATGTAACAGTA-1 | pbmc_10x | 3538 | 1316 | pbmc_10x | NA | NA | NA | NA |
| PBMC1_10x_TTGACCCGTTCAGTAC-1 | pbmc_10x | 7333 | 2109 | pbmc_10x | NA | NA | NA | NA |
| PBMC1_10x_AGTGCCGCAGGGAATC-1 | pbmc_10x | 5560 | 1915 | pbmc_10x | NA | NA | NA | NA |
| PBMC1_10x_GCATCTCTCCTGCTAC-1 | pbmc_10x | 8918 | 2801 | pbmc_10x | NA | NA | NA | NA |
| PBMC1_10x_CAAAGAAGTGACGTCC-1 | pbmc_10x | 4487 | 1426 | pbmc_10x | NA | NA | NA | NA |
| PBMC1_10x_CGGTCAGCACGGGTAA-1 | pbmc_10x | 4467 | 1513 | pbmc_10x | NA | NA | NA | NA |
# Print gene metadata
knitr::kable(head(sc[[1]][[param$assay_raw]][[]], 5), align="l", caption="Feature metadata, top 5 rows (only first dataset shown)") %>%
kableExtra::kable_styling(bootstrap_options=c("striped", "hover")) %>%
kableExtra::scroll_box(width="100%")| feature_id | feature_name | feature_type | |
|---|---|---|---|
| TSPAN6 | ENSG00000000003 | TSPAN6 | Gene Expression |
| TNMD | ENSG00000000005 | TNMD | Gene Expression |
| DPM1 | ENSG00000000419 | DPM1 | Gene Expression |
| SCYL3 | ENSG00000000457 | SCYL3 | Gene Expression |
| C1orf112 | ENSG00000000460 | C1orf112 | Gene Expression |
The steps below represent a standard pre-processing workflow for single-cell RNA-seq data, including quality control, the respective filtering of cells and genes, data normalization and scaling, and the detection of highly variable genes.
We start the analysis by removing unwanted cells from the dataset(s). Three commonly used QC metrics include the number of unique genes detected in each cell (“nFeature_RNA”), the total number of molecules detected in each cell (“nCount_RNA”), and the percentage of counts that map to the mitochrondrial genome (“percent_mt”). If ERCC spike-in controls were used, the percentage of counts mapping to them is also shown (“percent_ercc”).
# If filters were specified globally (i.e. not by sample), this chunk will copy them for each sample such that downstream filtering can work by sample
param$cell_filter = purrr::map(list_names(sc), function(s) {
if (s %in% names(param$cell_filter)) {
return(param$cell_filter[[s]])
} else {
return(param$cell_filter)
}
})
param$feature_filter = purrr::map(list_names(sc), function(s) {
if (s %in% names(param$feature_filter)) {
return(param$feature_filter[[s]])
} else {
return(param$feature_filter)
}
})# Calculate percentage of counts in mitochondrial genes for each Seurat object
sc = purrr::map(sc, function(s) {
mt_features = grep(pattern=param$mt, rownames(s), value=TRUE)
return(Seurat::PercentageFeatureSet(s, features=mt_features, col.name="percent_mt", assay=param$assay_raw))
})
# Calculate percentage of counts in ribosomal genes for each Seurat object
sc = purrr::map(sc, function(s) {
ribo_features = grep(pattern="^RP[SL]", rownames(s), value=TRUE, ignore.case=TRUE)
return(Seurat::PercentageFeatureSet(s, features=ribo_features, col.name="percent_ribo", assay=param$assay_raw))
})
# Calculate percentage of counts in ERCC for each Seurat object (if assay is available)
sc = purrr::map(sc, function(s) {
if ("ERCC" %in% Seurat::Assays(s)) s$percent_ercc = s$nCount_ERCC/(s$nCount_ERCC + s$nCount_RNA)*100
return(s)
})
# Combine (again) cell metadata of the Seurat objects into one big metadata, this time including mt and ercc
sc_cell_metadata = suppressWarnings(purrr::map_dfr(sc, function(s){ return(s[[]]) }) %>% as.data.frame())# Only RNA assay at the moment
# counts_median uses sapply on the counts matrix, which converts the sparse matrix into a normal matrix
# This might have to be adapted in future (Sparse Matrix Apply Function)
sc = purrr::map(list_names(sc), function(n) {
# Calculate percentage of counts per gene in a cell
counts_rna = Seurat::GetAssayData(sc[[n]], slot="counts", assay=param$assay_raw)
total_counts = sc[[n]][[paste0("nCount_", param$assay_raw), drop=TRUE]]
counts_rna_perc = Matrix::t(Matrix::t(counts_rna)/total_counts)*100
# Calculate feature filters
num_cells_expr = Matrix::rowSums(counts_rna >= 1)
num_cells_expr_threshold = Matrix::rowSums(counts_rna >= param$feature_filter[[n]][["min_counts"]])
# Calculate median of counts_rna_perc per gene
counts_median = apply(counts_rna_perc, 1, median)
# Add all QC measures as metadata
sc[[n]][[param$assay_raw]] = Seurat::AddMetaData(sc[[n]][[param$assay_raw]], data.frame(num_cells_expr, num_cells_expr_threshold, counts_median))
return(sc[[n]])
})# QC metrics to plot for cells
cell_qc_features = c(paste0(c("nFeature_", "nCount_"), param$assay_raw), "percent_mt")
if ("percent_ercc" %in% colnames(sc_cell_metadata)) cell_qc_features = c(cell_qc_features, "percent_ercc")
cell_qc_features = values_to_names(cell_qc_features)
# Get filter thresholds per QC metrics
cell_qc_thresholds = purrr::map(cell_qc_features, function(m) {
tresh = purrr::map_dfr(names(param$cell_filter), function(n) {
if (m %in% names(param$cell_filter[[n]])) {
t = data.frame(orig.ident=n, min=param$cell_filter[[n]][[m]][1], max=param$cell_filter[[n]][[m]][2]) %>%
tidyr::pivot_longer(cols=2:3, names_to=c("threshold")) %>%
dplyr::filter(!is.na(value))
t$threshold = factor(t$threshold, levels=c("min", "max"))
return(t)
}
})
})
# Now create plot per QC metric
p_list = list()
for (m in cell_qc_features) {
p_list[[m]]= ggplot(sc_cell_metadata[, c("orig.ident", m)], aes_string(x="orig.ident", y=m, fill="orig.ident", group="orig.ident")) +
geom_violin(scale="width")
# Adds points for samples with less than three cells since geom_violin does not work here
p_list[[m]] = p_list[[m]] +
geom_point(data=sc_cell_metadata[, c("orig.ident", m)] %>% dplyr::filter(orig.ident %in% names(which(table(sc_cell_metadata$orig.ident) < 3))), aes_string(x="orig.ident", y=m, fill="orig.ident"), shape=21, size=2)
# Now add style
p_list[[m]] = p_list[[m]] +
AddStyle(title=m, legend_position="none", fill=param$col_samples, xlab="") +
theme(axis.text.x=element_text(angle=45, hjust=1))
# Add filter threshold as segments to plot; min threshold lines are dashed and max threshold lines are twodashed
if (nrow(cell_qc_thresholds[[m]]) > 0) {
p_list[[m]] = p_list[[m]] + geom_segment(data=cell_qc_thresholds[[m]],
aes(x=as.integer(as.factor(orig.ident))-0.5,
xend=as.integer(as.factor(orig.ident))+0.5,
y=value, yend=value, lty=threshold), colour="firebrick") +
scale_linetype_manual(values=setNames(c("dashed", "F1"), c("max", "min")))
}
}
p = patchwork::wrap_plots(p_list, ncol=2) + patchwork::plot_annotation("Distribution of feature values")
p# Correlate QC metrics for cells
p_list = list()
sc_cell_metadata_plot_order = sample(1:nrow(sc_cell_metadata))
# nFeature vs nCount
m = paste0(c("nCount_", "nFeature_"), param$assay_raw)
p_list[[1]] = ggplot(sc_cell_metadata[sc_cell_metadata_plot_order, , drop=FALSE], aes_string(x=m[1], y=m[2], colour="orig.ident")) +
geom_point() +
scale_linetype_manual(values=setNames(c("dashed", "F1"), c("max", "min"))) +
AddStyle(col=param$col_samples)
if (nrow(cell_qc_thresholds[[m[1]]]) > 0) {
p_list[[1]] = p_list[[1]] + geom_vline(data=cell_qc_thresholds[[m[1]]], aes(xintercept=value, lty=threshold), colour="firebrick")
}
if (nrow(cell_qc_thresholds[[m[2]]]) > 0) {
p_list[[1]] = p_list[[1]] + geom_hline(data=cell_qc_thresholds[[m[2]]], aes(yintercept=value, lty=threshold), colour="firebrick")
}
# nFeature vs percent_mt
m = c("percent_mt", paste0(c("nFeature_"), param$assay_raw))
p_list[[2]] = ggplot(sc_cell_metadata[sc_cell_metadata_plot_order, , drop=FALSE], aes_string(x=m[1], y=m[2], colour="orig.ident")) +
geom_point() +
scale_linetype_manual(values=setNames(c("dashed", "F1"), c("max", "min"))) +
AddStyle(col=param$col_samples)
if (nrow(cell_qc_thresholds[[m[1]]]) > 0) {
p_list[[2]] = p_list[[2]] + geom_vline(data=cell_qc_thresholds[[m[1]]], aes(xintercept=value, lty=threshold), colour="firebrick")
}
if (nrow(cell_qc_thresholds[[m[2]]]) > 0) {
p_list[[2]] = p_list[[2]] + geom_hline(data=cell_qc_thresholds[[m[2]]], aes(yintercept=value, lty=threshold), colour="firebrick")
}
# nFeature vs percent_ercc (if available)
if ("percent_ercc" %in% names(cell_qc_features)) {
m = c("percent_ercc", paste0(c("nFeature_"), param$assay_raw))
p_list[[3]] = ggplot(sc_cell_metadata[sc_cell_metadata_plot_order, , drop=FALSE], aes_string(x=m[1], y=m[2], colour="orig.ident")) +
geom_point() +
scale_linetype_manual(values=setNames(c("dashed", "F1"), c("max", "min"))) +
AddStyle(col=param$col_samples)
if (nrow(cell_qc_thresholds[[m[1]]]) > 0) {
p_list[[3]] = p_list[[3]] + geom_vline(data=cell_qc_thresholds[[m[1]]], aes(xintercept=value, lty=threshold), colour="firebrick")
}
if (nrow(cell_qc_thresholds[[m[2]]]) > 0) {
p_list[[3]] = p_list[[3]] + geom_hline(data=cell_qc_thresholds[[m[2]]], aes(yintercept=value, lty=threshold), colour="firebrick")
}
}
# Combine plots
p = patchwork::wrap_plots(p_list, ncol=length(p_list)) + patchwork::plot_annotation("Features plotted against each other")
if (length(p_list) == 1) {
p = p & theme(legend.position="bottom")
} else {
p = p + patchwork::plot_layout(guides="collect") & theme(legend.position="bottom")
}
pWe next investigate whether there are individual genes that are represented by an unusually high number of counts. For each cell, we first calculate the percentage of counts per gene. Subsequently, for each gene, we calculate the median value of these percentages in all cells. Genes with the highest median percentage of counts are plotted below.
# Plot only samples that we intend to keep
sc_names = names(sc)[!(names(sc) %in% param$samples_to_drop)]
genes_highestExpr = lapply(sc_names, function(i) {
top_ten_exp = sc[[i]][[param$assay_raw]][["counts_median"]] %>% dplyr::arrange(dplyr::desc(counts_median)) %>% head(n=10)
return(rownames(top_ten_exp))
}) %>%
unlist() %>%
unique()
genes_highestExpr_counts = purrr::map_dfc(sc[sc_names], .f=function(s) s[[param$assay_raw]][["counts_median"]][genes_highestExpr, ])
genes_highestExpr_counts$gene = genes_highestExpr
genes_highestExpr_counts = genes_highestExpr_counts %>% tidyr::pivot_longer(cols=all_of(sc_names))
genes_highestExpr_counts$name = factor(genes_highestExpr_counts$name, levels=sc_names)
col = GenerateColours(num_colours=length(genes_highestExpr), names=genes_highestExpr, palette="ggsci::pal_simpsons")
p = ggplot(genes_highestExpr_counts, aes(x=name, y=value, col=gene, group=gene)) +
geom_point() +
AddStyle(title="Top 10 highest expressed genes per sample, added into one list",
xlab="Sample", ylab="Median % of raw counts\n per gene in a cell",
legend_position="bottom",
col=col)
if (length(unique(genes_highestExpr_counts$name))>1) p = p + geom_line()
pCells and genes are filtered based on the following thresholds:
cell_filter_lst = param$cell_filter %>% unlist(recursive=FALSE)
is_numeric_filter = purrr::map_lgl(cell_filter_lst, function(f) return(is.numeric(f) & length(f)==2))
# numeric cell filters
if (length(cell_filter_lst[is_numeric_filter]) > 0) {
purrr::invoke(rbind, cell_filter_lst[is_numeric_filter]) %>%
knitr::kable(align="l", caption="Numeric filters applied to cells", col.names=c("Min", "Max")) %>%
kableExtra::kable_styling(bootstrap_options=c("striped", "hover"))
}| Min | Max | |
|---|---|---|
| pbmc_10x.nFeature_RNA | NA | NA |
| pbmc_10x.percent_mt | NA | NA |
| pbmc_smartseq2_sample1.nFeature_RNA | NA | NA |
| pbmc_smartseq2_sample1.percent_mt | NA | NA |
# categorial cell filters
if (length(cell_filter_lst[!is_numeric_filter]) > 0) {
purrr::invoke(rbind, cell_filter_lst[!is_numeric_filter] %>% purrr::map(paste, collapse=",")) %>%
knitr::kable(align="l", caption="Categorial filters applied to cells", col.names=c("Values")) %>%
kableExtra::kable_styling(bootstrap_options=c("striped", "hover"))
}
# gene filters
feature_filter_lst = param$feature_filter %>% unlist(recursive=FALSE)
if (length(feature_filter_lst) > 0) {
purrr::invoke(rbind, feature_filter_lst) %>%
knitr::kable(align="l", caption="Filters applied to genes", col.names=c("Value")) %>%
kableExtra::kable_styling(bootstrap_options=c("striped", "hover"))
}| Value | |
|---|---|
| pbmc_10x.min_counts | 1 |
| pbmc_10x.min_cells | 3 |
| pbmc_smartseq2_sample1.min_counts | 1 |
| pbmc_smartseq2_sample1.min_cells | 3 |
The number of excluded cells and features is as follows:
# Iterate over datasets and filters
# Record a cell if it does not pass the filter
# Also record a cell if it belongs to a sample that should be dropped
sc_cells_to_exclude = purrr::map(list_names(sc), function(n) {
filter_result = purrr::map(list_names(param$cell_filter[[n]]), function(f) {
filter = param$cell_filter[[n]][[f]]
if (is.numeric(filter)) {
if (is.na(filter[1])) filter[1] = -Inf # Minimum
if (is.na(filter[2])) filter[2] = Inf # Maximum
idx_exclude = sc[[n]][[f, drop=TRUE]] < filter[1] | sc[[n]][[f, drop=TRUE]] > filter[2]
return(names(which(idx_exclude)))
} else if (is.character(filter)) {
idx_exclude = !sc[[n]][[f, drop=TRUE]] %in% filter
return(Cells(sc[[n]])[idx_exclude])
}
})
# Samples to drop
if (n %in% param$samples_to_drop) {
filter_result[["samples_to_drop"]] = colnames(sc[[n]])
} else {
filter_result[["samples_to_drop"]] = as.character(c())
}
# Minimum number of cells for a sample to keep
if (ncol(sc[[n]]) < param$samples_min_cells) {
filter_result[["samples_min_cells"]] = colnames(sc[[n]])
} else {
filter_result[["samples_min_cells"]] = as.character(c())
}
return(filter_result)
})
# Summarise
sc_cells_to_exclude_summary = purrr::map_dfr(sc_cells_to_exclude, function(s) {
return(as.data.frame(purrr::map(s, length)))
})
rownames(sc_cells_to_exclude_summary) = names(sc_cells_to_exclude)
sc_cells_to_exclude_summary$Original = purrr::map_int(sc, ncol)
sc_cells_to_exclude_summary$Excluded = purrr::map_int(sc_cells_to_exclude, function(s) { return(purrr::flatten(s) %>% unique() %>% length())})
sc_cells_to_exclude_summary$PercKept = round((sc_cells_to_exclude_summary$Original - sc_cells_to_exclude_summary$Excluded) / sc_cells_to_exclude_summary$Original * 100, 2)
knitr::kable(sc_cells_to_exclude_summary,
align="l",
caption="Summary of excluded cells") %>%
kableExtra::kable_styling(bootstrap_options=c("striped", "hover"))| nFeature_RNA | percent_mt | samples_to_drop | samples_min_cells | Original | Excluded | PercKept | |
|---|---|---|---|---|---|---|---|
| pbmc_10x | 0 | 0 | 0 | 0 | 100 | 0 | 100 |
| pbmc_smartseq2_sample1 | 0 | 0 | 0 | 0 | 100 | 0 | 100 |
# Add filter column to sc_cell_metadata for post-filtering QC
sc_cell_metadata$IS_FILTERED = rownames(sc_cell_metadata) %in% unlist(sc_cells_to_exclude)
# Now filter, drop the respective colours and adjust integration method
sc = purrr::map(list_names(sc), function(n) {
cells_to_keep = Cells(sc[[n]])
cells_to_keep = cells_to_keep[!cells_to_keep %in% purrr::flatten_chr(sc_cells_to_exclude[[n]])]
if (length(cells_to_keep)==0) return(NULL)
else return(subset(sc[[n]], cells=cells_to_keep))
}) %>% purrr::discard(is.null)
if (length(sc)==1) param$integrate_samples[["method"]] = "single"# Only RNA assay at the moment
# Iterate over samples and record a feature if it does not pass the filter
sc_features_to_exclude = purrr::map(list_names(sc), function(n) {
# Make sure the sample contains more cells than the minimum threshold
if (length(Cells(sc[[n]])) < param$feature_filter[[n]][["min_cells"]]) return(list())
# Return gene names that do not pass the minimum threshold
else return(names(which(sc[[n]][[param$assay_raw]][["num_cells_expr_threshold", drop=TRUE]] < param$feature_filter[[n]][["min_cells"]])))
})
# Which genes are to be filtered for all samples?
# Note: Make sure that no other sample is called "AllSamples"
sc_features_to_exclude$AllSamples = Reduce(f=intersect, x=sc_features_to_exclude)
# Summarise
sc_features_to_exclude_summary = purrr::map_dfr(names(sc), function(n){
df = data.frame(Original=nrow(sc[[n]]),
FailThreshold=length(sc_features_to_exclude[[n]]))
df$PercFailThreshold = round(df$FailThreshold / df$Original * 100, 2)
df$Kept = length(setdiff(rownames(sc[[n]]), sc_features_to_exclude[["AllSamples"]]))
df$PercKept = round(df$Kept / df$Original * 100, 2)
return(df)
})
rownames(sc_features_to_exclude_summary) = names(sc)
knitr::kable(sc_features_to_exclude_summary, align="l", caption="Summary of excluded genes") %>%
kableExtra::kable_styling(bootstrap_options=c("striped", "hover"))| Original | FailThreshold | PercFailThreshold | Kept | PercKept | |
|---|---|---|---|---|---|
| pbmc_10x | 33694 | 23929 | 71.02 | 14388 | 42.7 |
| pbmc_smartseq2_sample1 | 33694 | 19906 | 59.08 | 14388 | 42.7 |
# Now filter those genes that are to be filtered for all samples
sc = purrr::map(list_names(sc), function(n) {
assay_names = Seurat::Assays(sc[[n]])
features_to_keep = purrr::map(values_to_names(assay_names), function(a) {
features = rownames(sc[[n]][[a]])
keep = features[!features %in% sc_features_to_exclude$AllSamples]
return(keep)
})
return(subset(sc[[n]], features=purrr::flatten_chr(features_to_keep)))
})After filtering, the size of the Seurat object is:
sc## $pbmc_10x
## An object of class Seurat
## 14388 features across 100 samples within 1 assay
## Active assay: RNA (14388 features, 0 variable features)
##
## $pbmc_smartseq2_sample1
## An object of class Seurat
## 14388 features across 100 samples within 1 assay
## Active assay: RNA (14388 features, 0 variable features)
The updated QC plots are:
# Now create plot per QC metric after filtering
p_list = list()
for (m in cell_qc_features) {
p_list[[m]] = ggplot(sc_cell_metadata[, c("orig.ident", m, "IS_FILTERED")] %>% dplyr::filter(!IS_FILTERED), aes_string(x="orig.ident", y=m, fill="orig.ident", group="orig.ident")) +
geom_violin(scale="width")
# Adds points for samples with less than three cells since geom_violin does not work here
p_list[[m]] = p_list[[m]] +
geom_point(data=sc_cell_metadata[, c("orig.ident", m, "IS_FILTERED")] %>% dplyr::filter(!IS_FILTERED, orig.ident %in% names(which(table(sc_cell_metadata$orig.ident) < 3))), aes_string(x="orig.ident", y=m, fill="orig.ident"), shape=21, size=2)
# Now add style
p_list[[m]] = p_list[[m]] +
AddStyle(title=m, legend_position="none", fill=param$col_samples, xlab="") +
theme(axis.text.x=element_text(angle=45, hjust=1))
# Add filter threshold as segments to plot; min threshold lines are dashed and max threshold lines are twodashed
if (nrow(cell_qc_thresholds[[m]]) > 0) {
sample_names = sc_cell_metadata[, c("orig.ident", m, "IS_FILTERED")] %>% dplyr::filter(!IS_FILTERED) %>% dplyr::pull(orig.ident) %>% unique()
p_list[[m]] = p_list[[m]] + geom_segment(data=cell_qc_thresholds[[m]] %>% dplyr::filter(orig.ident %in% sample_names),
aes(x=as.integer(as.factor(orig.ident))-0.5,
xend=as.integer(as.factor(orig.ident))+0.5,
y=value, yend=value, lty=threshold), colour="firebrick") +
scale_linetype_manual(values=setNames(c("dashed", "F1"), c("max", "min")))
}
}
p = patchwork::wrap_plots(p_list, ncol=2) + patchwork::plot_annotation("Distribution of feature values")
p# Correlate QC metrics for cells
p_list = list()
sc_cell_metadata_plot_order = sample(1:nrow(sc_cell_metadata))
# nFeature vs nCount
m = paste0(c("nCount_", "nFeature_"), param$assay_raw)
p_list[[1]] = ggplot(sc_cell_metadata[sc_cell_metadata_plot_order, , drop=FALSE], aes_string(x=m[1], y=m[2], colour="orig.ident", alpha="IS_FILTERED")) +
geom_point() +
scale_linetype_manual(values=setNames(c("dashed", "F1"), c("max", "min"))) +
scale_alpha_manual(values=setNames(c(1, 0.1), c(FALSE, TRUE))) +
AddStyle(col=param$col_samples) +
guides(alpha = "none")
if (nrow(cell_qc_thresholds[[m[1]]]) > 0) {
p_list[[1]] = p_list[[1]] + geom_vline(data=cell_qc_thresholds[[m[1]]], aes(xintercept=value, lty=threshold), colour="firebrick")
}
if (nrow(cell_qc_thresholds[[m[2]]]) > 0) {
p_list[[1]] = p_list[[1]] + geom_hline(data=cell_qc_thresholds[[m[2]]], aes(yintercept=value, lty=threshold), colour="firebrick")
}
# nFeature vs percent_mt
m = c("percent_mt", paste0(c("nFeature_"), param$assay_raw))
p_list[[2]] = ggplot(sc_cell_metadata[sc_cell_metadata_plot_order, , drop=FALSE], aes_string(x=m[1], y=m[2], colour="orig.ident", alpha="IS_FILTERED")) +
geom_point() +
scale_linetype_manual(values=setNames(c("dashed", "F1"), c("max", "min"))) +
scale_alpha_manual(values=setNames(c(1, 0.1), c(FALSE, TRUE))) +
AddStyle(col=param$col_samples) +
guides(alpha = "none")
if (nrow(cell_qc_thresholds[[m[1]]]) > 0) {
p_list[[2]] = p_list[[2]] + geom_vline(data=cell_qc_thresholds[[m[1]]], aes(xintercept=value, lty=threshold), colour="firebrick")
}
if (nrow(cell_qc_thresholds[[m[2]]]) > 0) {
p_list[[2]] = p_list[[2]] + geom_hline(data=cell_qc_thresholds[[m[2]]], aes(yintercept=value, lty=threshold), colour="firebrick")
}
# nFeature vs percent_ercc
if ("percent_ercc" %in% names(cell_qc_features)) {
m = c("percent_ercc", paste0(c("nFeature_"), param$assay_raw))
p_list[[3]] = ggplot(sc_cell_metadata[sc_cell_metadata_plot_order, , drop=FALSE], aes_string(x=m[1], y=m[2], colour="orig.ident", alpha="IS_FILTERED")) +
geom_point() +
scale_linetype_manual(values=setNames(c("dashed", "F1"), c("max", "min"))) +
scale_alpha_manual(values=setNames(c(1, 0.1), c(FALSE, TRUE))) +
AddStyle(col=param$col_samples) +
guides(alpha = "none")
if (nrow(cell_qc_thresholds[[m[1]]]) > 0) {
p_list[[3]] = p_list[[3]] + geom_vline(data=cell_qc_thresholds[[m[1]]], aes(xintercept=value, lty=threshold), colour="firebrick")
}
if (nrow(cell_qc_thresholds[[m[2]]]) > 0) {
p_list[[3]] = p_list[[3]] + geom_hline(data=cell_qc_thresholds[[m[2]]], aes(yintercept=value, lty=threshold), colour="firebrick")
}
}
# Combine plots
p = patchwork::wrap_plots(p_list, ncol=length(p_list)) + patchwork::plot_annotation("Features plotted against each other")
if (length(p_list)==1) {
p = p & theme(legend.position="bottom")
} else {
p = p + patchwork::plot_layout(guides="collect") & theme(legend.position="bottom")
}
p# Remove filtered cells from meta data
sc_cell_metadata = sc_cell_metadata %>% dplyr::filter(IS_FILTERED==FALSE)
# Update levels but make sure level order stays the same
idx.factors = sapply(sc_cell_metadata, is.factor) %>% which()
for (n in colnames(sc_cell_metadata[idx.factors])) {
levels_old = sc_cell_metadata %>% dplyr::pull(n) %>% levels()
levels_new = sc_cell_metadata %>% dplyr::pull(n) %>% as.character() %>% unique()
sc_cell_metadata[[n]] = factor(sc_cell_metadata[[n]], levels=levels_old[levels_old %in% levels_new])
}
# Update actual colors as well, as they will appear in the plots otherwise
param$col_samples = param$col_samples[names(param$col_samples) %in% names(sc)]In this section, we subsequently run a series of Seurat functions for each provided sample:
The number of raw sequencing reads per cell is influenced by technical differences in the capture, reverse transcription and sequencing of RNA molecules, particularly due to the difficulty of achieving consistent library preparation with minimal starting material. Thus, comparing gene expression between cells may reveal differences that are solely due to sampling effects. After low-quality cells were removed in the previous step, the primary goal of normalization is to remove technical sampling effects while preserving the true biological signal.
Count depth scaling is the simplest and most commonly used normalization strategy. The underlying idea is that each cell initially contained an equal number of mRNA molecules, and differences arise due to sampling effects. For each cell, the number of reads per gene is divided by a cell-specific “size factor,” which is proportional to the total count depth of the cell. The resulting normalized data add up to 1 per cell, and is then typically multiplied by a factor of 10 (10,000 in this workflow).
Finally, normalized data are log-transformed for three important reasons. First, distances between log-transformed expression data represent log fold changes. Log-transformation emphasizes contributions from genes with strong relative differences, for example a gene that is expressed at an average count of 50 in cell type A and 10 in cell type B rather than a gene that is expressed at an average count of 1100 in A and 1000 in B. Second, log-transformation mitigates the relation between mean and variance in the data. Lastly, log-transformation reduces that skewness of the data as many downstream analyses require the data to be normally distributed.Normalisation method used for this report: RNA; with additional sources of variance regressed out: .
While raw data is typically used for statistical tests such as finding marker genes, normalised data is mainly used for visualising gene expression values. Scaled data include variable genes only, potentially without cell cycle effects, and are mainly used to determine the structure of the dataset(s) with Principal Component Analysis, and indirectly to cluster and visualise cells in 2D space.
# Normalise data the original way
# This is required to score cell cycle (https://github.com/satijalab/seurat/issues/1679)
sc = purrr::map(sc, Seurat::NormalizeData, normalization.method="LogNormalize", scale.factor=10000, verbose=FALSE)# Determine cell cycle effect per sample
sc = purrr::map(list_names(sc), function(n) {
sc[[n]] = CCScoring(sc=sc[[n]], genes_s=genes_s[,2], genes_g2m=genes_g2m[,2], name=n)
if (any(is.na(sc[[n]][["S.Score"]])) | any(is.na(sc[[n]][["G2M.Score"]]))) {
param$cc_remove=FALSE
param$cc_remove_all=FALSE
param$cc_rescore_after_merge=FALSE
}
return(sc[[n]])
})
# If cell cycle effects should be removed, we first score cells
# The effect is then removed in the following chunk
if (param$cc_remove) {
# Add to vars that need to regressed out during normalisation
if (param$cc_remove_all) {
# Remove all signal associated to cell cycle
param$vars_to_regress = unique(c(param$vars_to_regress, "S.Score", "G2M.Score"))
param$latent_vars = unique(c(param$latent_vars, "S.Score", "G2M.Score"))
} else {
# Don't remove the difference between cycling and non-cycling cells
param$vars_to_regress = unique(c(param$vars_to_regress, "CC.Difference"))
param$latent_vars = unique(c(param$latent_vars, "CC.Difference"))
}
}if (param$norm == "RNA") {
# Find variable features from normalised data (unaffected by scaling)
sc = purrr::map(sc, Seurat::FindVariableFeatures, selection.method="vst", nfeatures=3000, verbose=FALSE)
# Scale
# Note: For a single dataset where no integration/merging is needed, all features can already be scaled here.
# Otherwise, scaling of all features will be done after integration/merging.
if (param$integrate_samples[["method"]]=="single") {
sc[[1]] = Seurat::ScaleData(sc[[1]],
features=rownames(sc[[1]][["RNA"]]),
vars.to.regress=param$vars_to_regress,
verbose=FALSE)
}
} else if (param$norm == "SCT") {
# Run SCTransform
#
# This is a new normalisation method that replaces previous Seurat functions "NormalizeData", "FindVariableFeatures", and "ScaleData".
# vignette: https://satijalab.org/seurat/v3.0/sctransform_vignette.html
# paper: https://www.biorxiv.org/content/10.1101/576827v2
# Normalised data end up here: sc@assays$SCT@data
# Note: For a single dataset where no integration is needed, all features can already be scaled here.
# Otherwise, it is enough to scale only the variable features.
# Note: It is not guaranteed that all genes are successfully normalised with SCTransform.
# Consequently, some genes might be missing from the SCT assay.
# See: https://github.com/ChristophH/sctransform/issues/27
# Note: The performance of SCTransform can be improved by using "glmGamPoi" instead of "poisson" as method for initial parameter estimation.
sc = purrr::map(list_names(sc), function(n) {
SCTransform(sc[[n]],
assay=param$assay_raw,
vars.to.regress=param$vars_to_regress,
min_cells=param$feature_filter[[n]][["min_cells"]],
verbose=FALSE,
return.only.var.genes=!(param$integrate_samples[["method"]]=="single"),
method=ifelse(packages_installed("glmGamPoi"), "glmGamPoi", "poisson"))
})
}Experience shows that 1,000-2,000 genes with the highest cell-to-cell variation are often sufficient to describe the global structure of a single-cell dataset. For example, cell type-specific genes typically highly vary between cells. Housekeeping genes, on the other hand, are similarly expressed across cells and can be disregarded to differentiate between cells. Highly variable genes are typically the genes with a cell type specific expression profile, and are often the genes of interest in single-cell experiments. Housekeeping genes, with similar levels of expression across all cells, or genes with minor expression differences, might add random noise and mask relevant changes during downstream dimensionality reduction and clustering. We therefore aim to select a sensible set of variable genes that includes interesting biological signal and excludes noise. Here, the top 3,000 variable genes are selected and used for downstream analysis.
fig_height_vf = 5 * ceiling(length(names(sc))/2)p_list = purrr::map(list_names(sc), function(n) {
top10 = head(Seurat::VariableFeatures(sc[[n]], assay=ifelse(param$norm=="SCT", param$norm, param$assay_raw)), 10)
p = Seurat::VariableFeaturePlot(sc[[n]],
assay=ifelse(param$norm=="SCT", param$norm, param$assay_raw),
selection.method=ifelse(param$norm=="RNA", "vst", "sct"),
col=c("grey", param$col)) +
AddStyle(title=n) +
theme(legend.position=c(0.2, 0.8), legend.background=element_rect(fill=alpha("white", 0.0)))
p = LabelPoints(plot=p, points=top10, repel=TRUE, xnudge=0, ynudge=0)
return(p)
})
p = patchwork::wrap_plots(p_list, ncol=2) + patchwork::plot_annotation("Variable genes")
pif (param$integrate_samples[["method"]]!="single") {
# When merging, feature meta-data is removed by Seurat entirely; save separately for each assay except for SCT and add again afterwards
# Note: not sure whether this is still needed - discuss
assay_names = setdiff(unique(purrr::flatten_chr(purrr::map(list_names(sc), function(n) { Seurat::Assays(sc[[n]]) } ))), "SCT")
# Loop through all assays and accumulate meta data
sc_feature_metadata = purrr::map(values_to_names(assay_names), function(a) {
# "feature_id", "feature_name", "feature_type" are accumulated for all assays and stored just once
# This step is skipped for assays that do not contain all three types of feature information
contains_neccessary_columns = purrr::map_lgl(list_names(sc), function(n) {
all(c("feature_id", "feature_name", "feature_type") %in% colnames(sc[[n]][[a]][[]]))
})
if (all(contains_neccessary_columns)) {
feature_id_name_type = purrr::map(sc, function(s) return(s[[a]][[c("feature_id", "feature_name", "feature_type")]]) )
feature_id_name_type = purrr::reduce(feature_id_name_type, function(df_x, df_y) {
new_rows = which(!rownames(df_y) %in% rownames(df_x))
if (length(new_rows) > 0) return(rbind(df_x, df_y[new_rows, ]))
else return(df_x)
})
feature_id_name_type$row_names = rownames(feature_id_name_type)
} else {
feature_id_name_type = NULL
}
# For all other meta-data, we prefix column names with the dataset
other_feature_data = purrr::map(list_names(sc), function(n) {
df = sc[[n]][[a]][[]]
if (contains_neccessary_columns[[n]]) df = df %>% dplyr::select(-dplyr::one_of(c("feature_id", "feature_name", "feature_type"), c()))
if (ncol(df) > 0) colnames(df) = paste(n, colnames(df), sep=".")
df$row_names = rownames(df)
return(df)
})
# Now join everything by row_names by full outer join
if (!is.null(feature_id_name_type)) {
feature_data = purrr::reduce(c(list(feature_id_name_type=feature_id_name_type), other_feature_data), dplyr::full_join, by="row_names")
} else {
feature_data = purrr::reduce(other_feature_data, dplyr::full_join, by="row_names")
}
rownames(feature_data) = feature_data$row_names
feature_data$row_names = NULL
return(feature_data)
})
# When merging, cell meta-data are merged but factors are not kept
sc_cell_metadata = suppressWarnings(purrr::map_dfr(sc, function(s){ s[[]] }) %>% as.data.frame())
sc_cell_metadata_factor_levels = purrr::map(which(sapply(sc_cell_metadata, is.factor)), function(n) {
return(levels(sc_cell_metadata[, n, drop=TRUE]))
})
}# Standard method for integrating multiple samples. Best performance but computationally intensive.
if (param$integrate_samples[["method"]]=="integrate") {
if (!"use_reciprocal_pca" %in% names(param$integrate_samples)) param$integrate_samples[["use_reciprocal_pca"]] = FALSE
# Run integration
sc = RunIntegration(sc,
assay=param$norm,
ndims=param$integrate_samples[["dimensions"]],
verbose=FALSE,
reference=param$integrate_samples[["reference"]],
use_reciprocal_pca=param$integrate_samples[["use_reciprocal_pca"]],
k_filter=param$integrate_samples[["k.filter"]],
k_weight=param$integrate_samples[["k.weight"]],
k_anchor=param$integrate_samples[["k.anchor"]],
k_score=param$integrate_samples[["k.score"]])
# Re-score cell cycle effects after integration
if (param$cc_rescore_after_merge) {
sc = CCScoring(sc=sc, genes_s=genes_s[,2], genes_g2m=genes_g2m[,2])
if (any(is.na(sc[["S.Score"]])) | any(is.na(sc[["G2M.Score"]]))) {
param$cc_remove=FALSE
param$cc_remove_all=FALSE
param$cc_rescore_after_merge=FALSE
param$vars_to_regress = setdiff(param$vars_to_regress, c("S.Score", "G2M.Score", "CC.Difference"))
param$latent_vars = setdiff(param$latent_vars, c("S.Score", "G2M.Score", "CC.Difference"))
}
}
# (Re-)Run scaling
if (param$norm == "RNA") {
# According to Seurat, we need to scale data again for "RNAintegrated", and "RNA"
DefaultAssay(sc) = "RNAintegrated"
sc = Seurat::ScaleData(sc,
features=rownames(sc[["RNAintegrated"]]),
vars.to.regress=param$vars_to_regress,
assay="RNAintegrated",
verbose=FALSE)
DefaultAssay(sc) = "RNA"
sc = Seurat::ScaleData(sc,
features=rownames(sc[["RNA"]]),
vars.to.regress=param$vars_to_regress,
assay="RNA",
verbose=FALSE)
} else if (param$norm == "SCT") {
# We need to re-run SCTransform for the "SCT" assay again, to normalise on the complete dataset
DefaultAssay(sc) = "SCT"
min_cells_overall = max(purrr::map_int(param$feature_filter, function(f) as.integer(f[["min_cells"]])))
sc = SCTransform(sc,
assay=param$assay_raw,
vars.to.regress=param$vars_to_regress,
min_cells=min_cells_overall,
verbose=FALSE,
return.only.var.genes=FALSE,
method=ifelse(packages_installed("glmGamPoi"), "glmGamPoi", "poisson"))
}
# Add feature metadata
for (a in Seurat::Assays(sc)) {
if (a %in% names(sc_feature_metadata)) {
sc[[a]] = Seurat::AddMetaData(sc[[a]], sc_feature_metadata[[a]][rownames(sc[[a]]),, drop=FALSE])
}
}
# Fix cell metadata factors
for (f in names(sc_cell_metadata_factor_levels)) {
sc[[f]] = factor(sc[[f, drop=TRUE]], levels=sc_cell_metadata_factor_levels[[f]])
}
# Add sample colours again to misc slot
sc = ScAddLists(sc, lists=list(orig.ident=param$col_samples), lists_slot="colour_lists")
# Set default assay (will be the integrated version)
DefaultAssay(sc) = paste0(param$norm, "integrated")
message("Data values for all samples have been integrated.")
print(sc)
# Add sample as latent_vars for marker detection
param$latent_vars = c(param$latent_vars, "orig.ident")
}× (Message)
Data values for all samples have been integrated.
## An object of class Seurat
## 17388 features across 200 samples within 2 assays
## Active assay: RNAintegrated (3000 features, 3000 variable features)
## 1 other assay present: RNA
n_cells_rle_plot = 100
# Sample at most 100 cells per dataset and save their identity
cells_subset = sc[["orig.ident"]] %>% tibble::rownames_to_column() %>%
dplyr::group_by(orig.ident) %>%
dplyr::sample_n(size=min(n_cells_rle_plot, length(orig.ident))) %>%
dplyr::select(rowname, orig.ident)To better understand the efficiency of the applied normalisation procedures, we plot the relative log expression of genes in at most 100 randomly selected cells per sample before and after normalisation. This type of plot reveals unwanted variation in your data. The concept is taken from Gandolfo and Speed (2018). In brief, we remove variation between genes, leaving only variation between samples. If expression levels of most genes are similar in all cell types, sample heterogeneity is a sign of unwanted variation.
For each gene, we calculate its median expression across all cells, and then calculate the deviation from this median for each cell. For each cell, we plot the median expression (black), the interquartile range (lightgrey), whiskers defined as 1.5 times the interquartile range (darkgrey), and outliers (#374E55FF, #DF8F44FF)
# Plot raw data
p = PlotRLE(as.matrix(log2(GetAssayData(subset(sc, cells=cells_subset$rowname), assay=param$assay_raw, slot="counts") + 1)),
id=cells_subset$orig.ident,
col=param$col_samples) +
labs(title="log2(raw counts + 1)")
pDependent on the context, this tab refers to different data:
# Plot normalised data
p = PlotRLE(as.matrix(GetAssayData(subset(sc, cells=cells_subset$rowname), assay=ifelse(param$norm=="SCT", param$norm, param$assay_raw), slot="data")),
id=cells_subset$orig.ident,
col=param$col_samples) +
labs(title="Normalised data")
pif (! param$integrate_samples[["method"]] %in% c("single", "merge")) {
# Plot integrated data
p = PlotRLE(as.matrix(GetAssayData(subset(sc, cells=cells_subset$rowname), assay=paste0(param$norm, "integrated"), slot="data")),
id=cells_subset$orig.ident,
col=param$col_samples) +
labs(title="Integrated data")
p
} else {
message("No integrated data available.")
}A single-cell dataset of 20,000 genes and 5,000 cells has 20,000 dimensions. At this point of the analysis, we have already reduced the dimensionality of the dataset to 3,000 variable genes. The biological manifold however can be described by far fewer dimensions than the number of (variable) genes, since expression profiles of different genes are correlated if they are involved in the same biological process. Dimension reduction methods aim to find these dimensions. There are two general purposes for dimension reduction methods: to summarize a dataset, and to visualize a dataset.
We use Principal Component Analysis (PCA) to summarize a dataset, overcoming noise and reducing the data to its essential components. Later, we use Uniform Manifold Approximation and Projection (UMAP) to visualize the dataset, placing similar cells together in 2D space, see below.
Principal Component Analysis is a way to summarize a dataset and to reduce noise by averaging similar gene expression profiles. The information for correlated genes is compressed into single dimensions called principal components (PCs) and the analysis identifies those dimensions that capture the largest amount of variation. This process gives a more precise representation of the patterns inherent to complex and large datasets.
In a PCA, the first PC captures the greatest variance across the whole dataset. The next PC captures the greatest remaining amount of variance, and so on. This way, the top PCs are likely to represent the biological signal where multiple genes are affected by the same biological processes in a coordinated way. In contrast, random technical or biological noise that affects each gene independently are contained in later PCs. Downstream analyses can be restricted to the top PCs to focus on the most relevant biological signal and to reduce noise and unnecessary computational overhead.To decide how many PCs to include in downstream analyses, we visualise the cells and genes that define the PCA.
# Run PCA for default normalisation
sc = Seurat::RunPCA(sc, features=Seurat::VariableFeatures(object=sc), verbose=FALSE, npcs=min(50, ncol(sc)))p_list = Seurat::VizDimLoadings(sc, dims=1:2, reduction="pca", col=param$col, combine=FALSE, balanced=TRUE)
for (i in seq(p_list)) p_list[[i]] = p_list[[i]] + AddStyle()
p = patchwork::wrap_plots(p_list, ncol=2) + patchwork::plot_annotation("Top gene loadings of the first two PCs")
pp = Seurat::DimPlot(sc, reduction="pca", cols=param$col_samples) +
AddStyle(title="Cells arranged by the first two PCs", legend_position="bottom")
pp_list = Seurat::DimHeatmap(sc, dims=1:min(20, ncol(sc)), cells=min(500, ncol(sc)), balanced=TRUE, fast=FALSE, combine=FALSE)
p_list = purrr::map(seq(p_list), function(i) {
p_list[[i]] = p_list[[i]] +
ggtitle(paste("PC", i)) +
theme(legend.position="none", axis.text.y=element_text(size=8))
return(p_list[[i]])
})
p = patchwork::wrap_plots(p_list, ncol=3) + patchwork::plot_annotation("Top gene loadings of the first PCs")
pWe next need to decide how many PCs we want to use for our analyses. PCs include biological signal as well as noise, and we need to determine the number of PCs with which we include as much biological signal as possible and as little noise as possible. The following “Elbow plot” is designed to help us make an informed decision. It shows PCs ranked based on the percentage of variance they explain.
# More approximate technique used to reduce computation time
p = Seurat::ElbowPlot(sc, ndims=min(20, ncol(sc))) +
geom_vline(xintercept=param$pc_n + .5, col="firebrick", lty=2) +
AddStyle(title="Elbow plot")
p# Cannot have more PCs than number of cells
param$pc_n = min(param$pc_n, ncol(sc))For the current dataset, 10 PCs were chosen.
Seurat’s clustering method first constructs a graph structure, where nodes are cells and edges are drawn between cells with similar gene expression patterns. Technically speaking, Seurat first constructs a K-nearest neighbor (KNN) graph based on Euclidean distance in PCA space, and refines edge weights between cells based on the shared overlap in their local neighborhoods (Jaccard similarity). To partition the graph into highly interconnected parts, cells are iteratively grouped together using the Leiden algorithm (Traag, Waltman, and van Eck 2019).
At this point, we would like to define subpopulations of cells with similar gene expression profiles using unsupervised clustering. Clusters ultimately serve as approximations for biological objects like cell types or cell states.
During the first step of clustering, a K-nearest neighbor (KNN) graph is constructed. In simplified terms this means that cells are connected to their K nearest neighbors based on cell-to-cell expression similarity using the PCs chosen in the previous step. The higher the similarity is, the higher the edge weight becomes. During the second step, the graph is partitioned into highly interconnected communities, whereby each community represents a cluster of cells with similar expression profiles. The separation of the graph into clusters is dependent on the chosen resolution. For scRNA-seq datasets of around 3000 cells, it is recommended to use a resolution value between 0.4 and 1.2. This value can be set even higher for larger datasets. Note that the choice of PCs and cluster resolution is an arbitrary one. Therefore, it is highly recommended to evaluate clusters and re-run the workflow with adapted parameters if needed.To get a first idea about how different cluster resolution values influence the clustering, we run and visualize the clustering multiple times, see below. For this report, you chose the resolution value 1 as the final value for further analyses.
cluster_resolutions = sort(unique(c(param$cluster_resolution, param$cluster_resolution_test)))
# Construct phylogenetic tree relating the "average" cell from each sample
if (length(levels(sc$orig.ident)) > 1) {
sc = suppressWarnings(Seurat::BuildClusterTree(sc, features=rownames(sc), verbose=FALSE))
Seurat::Misc(sc, "trees") = list(orig.ident = Seurat::Tool(sc, "Seurat::BuildClusterTree"))
}
# The number of clusters can be optimized by tuning "resolution" -> based on feedback from the client whether or not clusters make sense
# Choose the number of PCs to use for clustering
sc = Seurat::FindNeighbors(sc, dims=1:param$pc_n, verbose=FALSE, k.param=param$cluster_k)
# Seurat vignette suggests resolution parameter between 0.4-1.2 for datasets of about 3k cells
# But we can run multiple resolutions if requested
sc = Seurat::FindClusters(sc, resolution=cluster_resolutions, algorithm=4, verbose=FALSE, method="igraph")
# Construct phylogenetic tree relating the "average" cell from each cluster for each clustering
# Also add colour lists for each clustering
for(r in cluster_resolutions) {
n = paste0(DefaultAssay(sc), "_snn_res.", r)
# Tree
if (length(levels(sc[[n, drop=TRUE]])) > 1) {
Seurat::Idents(sc) = n
sc = suppressWarnings(Seurat::BuildClusterTree(sc, dims=1:param$pc_n, verbose=FALSE))
l = list(Seurat::Tool(sc, "Seurat::BuildClusterTree"))
names(l) = n
suppressWarnings({Seurat::Misc(sc, "trees") = c(Seurat::Misc(sc, "trees"), l)})
}
col = GenerateColours(num_colours=length(levels(sc[[n, drop=TRUE]])), names=levels(sc[[n, drop=TRUE]]),
palette=param$col_palette_clusters, alphas=1)
# Colours
l = list(col)
names(l) = n
sc = ScAddLists(sc, lists=l, lists_slot="colour_lists")
}
# Set default clustering
n = paste0(DefaultAssay(sc), "_snn_res.", param$cluster_resolution)
sc$seurat_clusters = sc[[n, drop=TRUE]]
Seurat::Idents(sc) = sc$seurat_clusters
if (length(levels(sc$seurat_clusters)) > 1) {
suppressWarnings({Seurat::Misc(sc, "trees") = c(Seurat::Misc(sc, "trees"), list(seurat_clusters = Seurat::Misc(sc, "trees")[[n]]))})
}
# Set up colors for default clustering
sc = ScAddLists(sc, lists=list(seurat_clusters=Misc(sc, "colour_lists")[[n]]), lists_slot="colour_lists")
param$col_clusters = Misc(sc, "colour_lists")[["seurat_clusters"]]
# Define height of test clusters
height_per_row = 3
nr_rows = ceiling(length(cluster_resolutions)/2)
fig_height_test_clusters = nr_rows * height_per_row# Default UMAP
sc = suppressWarnings(Seurat::RunUMAP(sc, dims=1:param$pc_n, verbose=FALSE, umap.method="uwot", n.neighbors=param$umap_k))
# If there are any test resolutions other than the default, go ahead
if (length(cluster_resolutions) > 1) {
p_list = list()
for(r in cluster_resolutions) {
r = as.character(r)
n = paste0(DefaultAssay(sc), "_snn_res.", r)
cluster_cells = table(sc[[n, drop=TRUE]])
cluster_labels = paste0(names(cluster_cells)," (", cluster_cells,")")
p_list[[r]] = Seurat::DimPlot(sc, reduction="umap", group.by=n, pt.size=param$pt_size, label=TRUE) +
scale_color_manual(values=Seurat::Misc(sc, "colour_lists")[[n]], labels=cluster_labels) +
AddStyle(title=r, legend_position="bottom", legend_title="Clusters") +
FontSize(x.text = 10, y.text = 10, x.title = 13, y.title = 13, main = 15)
}
p = patchwork::wrap_plots(p_list, ncol=3) + patchwork::plot_annotation(title="UMAP, cells coloured by cluster identity for different resolution values")
print(p)
}We use a UMAP to visualise and explore a dataset. The goal is to place similar cells together in 2D space, and learn about the biology underlying the data. Cells are color-coded according to the graph-based clustering, and clusters typcially co-localise on the UMAP.
Take care not to mis-read a UMAP:
For a nice read to intuitively understand UMAP, see https://pair-code.github.io/understanding-umap/.
# Note that you can set `label = TRUE` or use the LabelClusters function to help label individual clusters
cluster_cells = table(sc@active.ident)
cluster_labels = paste0(levels(sc@active.ident)," (", cluster_cells[levels(sc@active.ident)],")")
p = Seurat::DimPlot(sc, reduction="umap", group.by="seurat_clusters") +
scale_color_manual(values=param$col_clusters, labels=cluster_labels) +
AddStyle(title="UMAP, cells coloured by cluster identity", legend_position="bottom", legend_title="Clusters")
p = LabelClusters(p, id="seurat_clusters", box=TRUE, fill="white")
p# Plot all samples separately
# Repel will be deactivated if there are more than six samples; otherwise the plot may crash
p = Seurat::DimPlot(sc, reduction="umap", group.by="seurat_clusters", pt.size=param$pt_size, split.by = "orig.ident", ncol = 2) +
scale_color_manual(values=param$col_clusters, labels=cluster_labels) +
AddStyle(title="UMAP, cells coloured by cluster identity", legend_position="bottom", legend_title="Clusters")
p = LabelClusters(p, id="seurat_clusters", box=TRUE, fill="white", repel=ifelse(length(unique(sc$orig.ident)) <= 5, TRUE, FALSE))
psample_cells = table(sc$orig.ident)
sample_labels = paste0(levels(sc$orig.ident)," (", sample_cells[levels(sc$orig.ident)],")")
# Note: This is a hack to colour by sample but label by Cluster
p = Seurat::DimPlot(sc, reduction="umap", group.by="orig.ident") +
scale_color_manual(values=param$col_samples, labels=sample_labels) +
AddStyle(title="UMAP, cells coloured by sample of origin", legend_position="bottom")
p$data$seurat_clusters = sc[["seurat_clusters"]][rownames(p$data), ]
p = LabelClusters(p, id="seurat_clusters", box=TRUE, segment.color="black", fill="white")
p# Plot all samples separately
# Repel will be deactivated if there are more than six samples; otherwise the plot may crash
p = Seurat::DimPlot(sc, reduction="umap", group.by="orig.ident", pt.size=param$pt_size, split.by = "orig.ident", ncol = 2) +
scale_color_manual(values=param$col_samples, labels=sample_labels) +
AddStyle(title="UMAP, cells coloured by sample of origin", legend_position="bottom")
p$data$seurat_clusters = sc[["seurat_clusters"]][rownames(p$data), ]
p = LabelClusters(p, id="seurat_clusters", box=TRUE, segment.color="black", fill="white", repel=ifelse(length(unique(sc$orig.ident)) <= 5, TRUE, FALSE))
p# Count cells per cluster per sample
cell_samples = sc[[]] %>% dplyr::pull(orig.ident) %>% levels()
cell_clusters = sc[[]] %>% dplyr::pull(seurat_clusters) %>% levels()
tbl = dplyr::count(sc[[c("orig.ident", "seurat_clusters")]], orig.ident, seurat_clusters) %>% tidyr::pivot_wider(names_from="seurat_clusters", names_prefix="Cl_", values_from=n, values_fill=0) %>% as.data.frame()
rownames(tbl) = paste0(tbl[,"orig.ident"],"_n")
tbl[,"orig.ident"] = NULL
# Add percentages
tbl_perc = round(t(tbl) / colSums(tbl) * 100, 2) %>% t()
rownames(tbl_perc) = gsub(rownames(tbl_perc), pattern="_n$", replacement="_perc", perl=TRUE)
tbl = rbind(tbl, tbl_perc)
# Add enrichment
if (length(cell_samples) > 1 & length(cell_clusters) > 1) tbl = rbind(tbl, CellsFisher(sc))
# Sort
tbl = tbl[order(rownames(tbl)),, drop=FALSE]
# Plot percentages
tbl_bar = tbl[paste0(cell_samples, "_perc"), , drop=FALSE] %>%
tibble::rownames_to_column(var="Sample") %>%
tidyr::pivot_longer(tidyr::starts_with("Cl"), names_to="Cluster", values_to="Percentage")
tbl_bar$Cluster = tbl_bar$Cluster %>% gsub(pattern="^Cl_", replacement="", perl=TRUE) %>% factor(levels=sc$seurat_clusters %>% levels())
tbl_bar$Sample = tbl_bar$Sample %>% gsub(pattern="_perc$", replacement="", perl=TRUE) %>% as.factor()
tbl_bar$Percentage = as.numeric(tbl_bar$Percentage)
p = ggplot(tbl_bar, aes(x=Cluster, y=Percentage, fill=Sample)) +
geom_bar(stat="identity" ) +
AddStyle(title="Percentage cells of samples in clusters",
fill=param$col_samples,
legend_title="Sample",
legend_position="bottom")
pThe following table shows the number of cells per sample and cluster:
In case the dataset contains 2 or more samples, we also calculate whether or not the number of cells of a sample in a cluster is significantly higher or lower than expected:
# Print table
knitr::kable(tbl, align="l", caption="Number of cells per sample and cluster") %>%
kableExtra::kable_styling(bootstrap_options=c("striped", "hover")) %>%
kableExtra::scroll_box(width="100%", fixed_thead=TRUE)| Cl_1 | Cl_2 | Cl_3 | Cl_4 | |
|---|---|---|---|---|
| pbmc_10x_n | 29 | 29 | 31 | 11 |
| pbmc_10x_perc | 43.94 | 47.54 | 67.39 | 40.74 |
| pbmc_10x.oddsRatio | 0.7 | 0.87 | 2.53 | 0.65 |
| pbmc_10x.p | 9.1e-01 | 7.3e-01 | 5.6e-03 | 8.9e-01 |
| pbmc_smartseq2_sample1_n | 37 | 32 | 15 | 16 |
| pbmc_smartseq2_sample1_perc | 56.06 | 52.46 | 32.61 | 59.26 |
| pbmc_smartseq2_sample1.oddsRatio | 1.44 | 1.15 | 0.39 | 1.54 |
| pbmc_smartseq2_sample1.p | 1.5e-01 | 3.8e-01 | 1.0e+00 | 2.0e-01 |
# Reset default assay, so we won't plot integrated data
# Note: We need integrated data for UMAP, clusters
DefaultAssay(sc) = ifelse(param$norm=="SCT", param$norm, param$assay_raw)How much do gene expression profiles in the dataset reflect the cell cycle phases the single cells were in? After initial normalisation, we determined the effects of cell cycle heterogeneity by calculating a score for each cell based on its expression of G2M and S phase markers. Scoring is based on the strategy described in Tirosh et al. (2016), and human gene symbols are translated to gene symbols of the species of interest using biomaRt. This section of the report visualises the above calculated cell cycle scores.
# Set up colours for cell cycle effect and add to sc object
col = GenerateColours(num_colours=length(levels(sc$Phase)), names=levels(sc$Phase), palette="ggsci::pal_npg", alphas=1)
sc = ScAddLists(sc, lists=list(Phase=col), lists_slot="colour_lists")
# Get a feeling for how many cells are affected
p1 = ggplot(sc[[]], aes(x=S.Score, y=G2M.Score, colour=Phase)) +
geom_point() +
scale_x_continuous("G1/S score") +
scale_y_continuous("G2/M score") +
AddStyle(col=Misc(sc, "colour_lists")[["Phase"]])
p2 = ggplot(sc@meta.data %>%
dplyr::group_by(seurat_clusters, Phase) %>%
dplyr::summarise(num_cells=length(Phase)),
aes(x=seurat_clusters, y=num_cells, fill=Phase)) +
geom_bar(stat="identity", position="fill") +
scale_x_discrete("Seurat clusters") +
scale_y_continuous("Fraction of cells") +
AddStyle(fill=Misc(sc, "colour_lists")[["Phase"]])
p3 = ggplot(sc[[]] %>%
dplyr::group_by(orig.ident, Phase) %>%
dplyr::summarise(num_cells=length(Phase)),
aes(x=orig.ident, y=num_cells, fill=Phase)) +
geom_bar(stat="identity", position="fill") +
scale_y_continuous("Fraction of cells") +
AddStyle(fill=Misc(sc, "colour_lists")[["Phase"]]) +
theme(axis.text.x = element_text(angle=30, hjust=1, vjust=1)) + xlab("")
p = p1 + p2 + p3 & theme(legend.position="bottom")
p = p + patchwork::plot_annotation(title="Cell cycle phases") + plot_layout(guides="collect")
pif (any(!is.na(sc$Phase))) {
# UMAP with phases superimposed
# Note: This is a hack to colour by phase but label by Cluster
p = Seurat::DimPlot(sc, group.by="Phase", pt.size=1, cols=Misc(sc, "colour_lists")[["Phase"]]) +
AddStyle(title="UMAP, cells coloured by cell cycle phases", legend_title="Phase")
p$data$seurat_clusters = sc[["seurat_clusters"]][rownames(p$data), ]
p = LabelClusters(p, id="seurat_clusters", box=TRUE, fill="white")
p
}if (any(!is.na(sc$Phase))) {
p = Seurat::FeaturePlot(sc, features="S.Score", pt.size=1, min.cutoff="q1", max.cutoff="q99", cols=c("lightgrey", param$col)) +
AddStyle(title="UMAP, cells coloured by S phase")
p = LabelClusters(p, id="ident", box=TRUE, fill="white")
p
}if (any(!is.na(sc$Phase))) {
p = Seurat::FeaturePlot(sc, features="G2M.Score", pt.size=1, min.cutoff="q1", max.cutoff="q99", cols=c("lightgrey", param$col)) +
AddStyle(title="UMAP, cells coloured by G2M phase")
p = LabelClusters(p, id="ident", box=TRUE, fill="white")
p
}if (any(!is.na(sc$Phase))) {
p = Seurat::FeaturePlot(sc, features="CC.Difference", pt.size=1, min.cutoff="q1", max.cutoff="q99", cols=c("lightgrey", param$col)) +
AddStyle(title="UMAP, cells coloured by CC.Difference")
p = LabelClusters(p, id="ident", box=TRUE, fill="white")
p
}Do cells in individual clusters have particularly high counts, detected genes or mitochondrial content?
qc_feature = paste0("nCount_", param$assay_raw)
p1 = suppressMessages(Seurat::FeaturePlot(sc, features=qc_feature) +
AddStyle(title="Feature plot") +
scale_colour_gradient(low="lightgrey", high=param$col, trans="log10"))
p1 = LabelClusters(p1, id="ident", box=FALSE)
p2 = ggplot(sc[[]], aes_string(x="seurat_clusters", y=qc_feature, fill="seurat_clusters", group="seurat_clusters")) +
geom_violin(scale="width") +
AddStyle(title="Violin plot (log10 scale)", fill=param$col_clusters,
xlab="Cluster", legend_position="none") +
scale_y_log10()
p = p1 | p2
p = p + patchwork::plot_annotation(title=paste0("Summed raw counts (", qc_feature, ", log10 scale)"))
pm = paste0(c("nCount_", "nFeature_"), param$assay_raw)
p = ggplot(sc[[c(m, "seurat_clusters")]], aes_string(x=m[1], y=m[2], colour="seurat_clusters")) +
geom_point() +
AddStyle(col=param$col_clusters) +
facet_wrap(~seurat_clusters) +
theme(legend.position = "none")
pqc_feature = paste0("nFeature_", param$assay_raw)
p1 = suppressMessages(Seurat::FeaturePlot(sc, features=qc_feature) +
AddStyle(title="Feature plot") +
scale_colour_gradient(low="lightgrey", high=param$col, trans="log10"))
p1 = LabelClusters(p1, id="ident", box=FALSE)
p2 = ggplot(sc[[]], aes_string(x="seurat_clusters", y=qc_feature, fill="seurat_clusters", group="seurat_clusters")) +
geom_violin(scale="width") +
AddStyle(title="Violin plot", fill=param$col_clusters,
xlab="Cluster", legend_position="none") +
scale_y_log10()
p = p1 | p2
p = p + patchwork::plot_annotation(title=paste0("Number of features with raw count > 0 (", qc_feature, ", log10 scale)"))
pm = paste0(c("nFeature_", "nFeature_"), param$assay_raw)
p = ggplot(sc[[c(m, "seurat_clusters")]], aes_string(x=m[1], y=m[2], colour="seurat_clusters")) +
geom_point() +
AddStyle(col=param$col_clusters) +
facet_wrap(~seurat_clusters) +
theme(legend.position = "none")
pp1 = Seurat::FeaturePlot(sc, features="percent_mt", cols=c("lightgrey", param$col)) +
AddStyle(title="Feature plot")
p1 = LabelClusters(p1, id="ident", box=FALSE)
p2 = ggplot(sc[[]], aes(x=seurat_clusters, y=percent_mt, fill=seurat_clusters, group=seurat_clusters)) +
geom_violin(scale="width") +
AddStyle(title="Violin plot", fill=param$col_clusters,
xlab="Cluster", legend_position="none")
p = p1 | p2
p = p + patchwork::plot_annotation(title="Percent of mitochondrial features (percent_mt)")
pm = c("percent_mt", paste0("nFeature_", param$assay_raw))
p = ggplot(sc[[c(m, "seurat_clusters")]], aes_string(x=m[1], y=m[2], colour="seurat_clusters")) +
geom_point() +
AddStyle(col=param$col_clusters) +
facet_wrap(~seurat_clusters) +
theme(legend.position = "none")
pp1 = Seurat::FeaturePlot(sc, features="percent_ribo", cols=c("lightgrey", param$col)) +
AddStyle(title="Feature plot")
p1 = LabelClusters(p1, id="ident", box=FALSE)
p2 = ggplot(sc[[]], aes(x=seurat_clusters, y=percent_ribo, fill=seurat_clusters, group=seurat_clusters)) +
geom_violin(scale="width") +
AddStyle(title="Violin plot", fill=param$col_clusters,
xlab="Cluster", legend_position="none")
p = p1 | p2
p = p + patchwork::plot_annotation(title="Percent of ribosomal features (percent_ribo)")
pm = c("percent_ribo", paste0("nFeature_", param$assay_raw))
p = ggplot(sc[[c(m, "seurat_clusters")]], aes_string(x=m[1], y=m[2], colour="seurat_clusters")) +
geom_point() +
AddStyle(col=param$col_clusters) +
facet_wrap(~seurat_clusters) +
theme(legend.position = "none")
pif ("percent_ercc" %in% colnames(sc[[]])) {
p1 = Seurat::FeaturePlot(sc, features="percent_ercc", cols=c("lightgrey", param$col)) +
AddStyle(title="Feature plot")
p1 = LabelClusters(p1, id="ident", box=FALSE)
p2 = ggplot(sc[[]], aes(x=seurat_clusters, y=percent_ercc, fill=seurat_clusters, group=seurat_clusters)) +
geom_violin(scale="width") +
AddStyle(title="Violin plot", fill=param$col_clusters,
xlab="Cluster", legend_position="none")
p = p1 | p2
p = p + patchwork::plot_annotation(title="Percent of ERCC features (percent_ercc)")
print(p)
# Plot percent_ercc vs nFeature by cluster
m = c("percent_ercc", paste0("nFeature_", param$assay_raw))
p = ggplot(sc[[c(m, "seurat_clusters")]], aes_string(x=m[1], y=m[2], colour="seurat_clusters")) +
geom_point() +
AddStyle(col=param$col_clusters) +
facet_wrap(~seurat_clusters) +
theme(legend.position = "none")
print(p)
} else {
message("No ERCC controls found.")
}× (Message)
No ERCC controls found.
Do cells in individual clusters express provided known marker genes?
known_markers_list=c()
# Overwrite empty list of known markers
if (!is.null(param$file_known_markers)) {
# Read known marker genes and map to rownames
known_markers = openxlsx::read.xlsx(param$file_known_markers)
known_markers_list = lapply(colnames(known_markers), function(x) {
y = ensembl_to_seurat_rowname[known_markers[,x]] %>%
na.exclude() %>% unique() %>% sort()
m = !y %in% rownames(sc)
if (any(m)){
Warning(paste0("The following genes of marker list '", x, "' cannot be found in the data: ", first_n_elements_to_string(y[m], n=10)))
}
return(y[!m])
})
# Remove empty lists
names(known_markers_list) = colnames(known_markers)
is_empty = purrr::map_int(known_markers_list, .f=length) == 0
known_markers_list = known_markers_list[!is_empty]
# Add lists to sc object
sc = ScAddLists(sc, lists=setNames(known_markers_list, paste0("known_marker_", names(known_markers_list))), lists_slot="gene_lists")
}
# Set plot options
if(length(known_markers_list) > 0) {
known_markers_n = length(known_markers_list)
known_markers_vect = unlist(known_markers_list) %>% unique() %>% sort()
idx_dotplot = sapply(seq(known_markers_list), function(x) length(known_markers_list[[x]]) <= 50)
idx_avgplot = sapply(seq(known_markers_list), function(x) length(known_markers_list[[x]]) >= 10)
} else {
known_markers_n=0
idx_dotplot = idx_avgplot = FALSE
known_markers_vect = c()
}# The height of feature and violin plots is independent of the number of clusters
# The height of dotplots is dependent on the number of clusters
height_per_row = 3
height_per_row_variable = max(2, 0.3 * length(levels(sc$seurat_clusters)))
# The total heights of average feature plots and dotplots depend on the number of lists provided
fig_height_knownMarkers_avgplot = max(height_per_row, height_per_row * sum(idx_avgplot))
fig_height_knownMarkers_dotplot = max(height_per_row_variable, height_per_row_variable * sum(idx_dotplot)) %>% min(100)
# The total height of individual feature plots depends on the total number of known markers
nr_rows = ceiling(length(known_markers_vect)/2)
fig_height_knownMarkers_vect = max(height_per_row, height_per_row * nr_rows)You provided 6 list(s) of known marker genes. In the following tabs, you find:
A dot plot visualises how gene expression changes across different clusters. The size of a dot encodes the percentage of cells in a cluster that expresses the gene, while the color encodes the scaled average expression across all cells within the cluster. Per gene (column), we group cells based on cluster identity (rows), calculate average expression per cluster, subtract the mean of average expression values and divide by the standard deviation. The resulting scores describe how high or low a gene is expressed in a cluster compared to all other clusters.
if ((known_markers_n > 0) & any(idx_dotplot)) {
known_markers_dotplot = known_markers_list[idx_dotplot]
p_list = list()
for (i in seq(known_markers_dotplot)) {
g = known_markers_dotplot[[i]]
g = g[length(g):1]
p_list[[i]] = suppressMessages(
Seurat::DotPlot(sc, features=g) +
scale_colour_gradient2(low="steelblue", mid="lightgrey", high="darkgoldenrod1") +
AddStyle(title=paste("Known marker genes:", names(known_markers_dotplot)[i]), ylab="Cluster") +
theme(axis.text.x=element_text(angle=90, hjust=1, vjust=.5)) +
lims(size=c(0,100))
)
}
p = patchwork::wrap_plots(p_list, ncol=1)
p
} else if ((known_markers_n > 0) & !any(idx_dotplot)) {
message("This tab is used for dot plots for up to 50 genes. All provided lists are longer than this, and hence dot plots are skipped.")
} else {
message("No known marker genes were provided and hence dot plots are skipped.")
}An average feature plot visualises the average gene expression of each gene list on a single-cell level, subtracted by the aggregated expression of control feature sets. The color of the plot encodes the calculated scores, whereat positive scores suggest that genes are expressed more highly than expected.
if ((known_markers_n > 0) & any(idx_avgplot)) {
known_markers_avgplot = known_markers_list[idx_avgplot]
sc = Seurat::AddModuleScore(sc, features=known_markers_avgplot, ctrl=10, name="known_markers")
idx_replace_names = grep("^known_markers[0-9]+$", colnames(sc@meta.data), perl=TRUE)
colnames(sc@meta.data)[idx_replace_names] = names(known_markers_avgplot)
p_list = Seurat::FeaturePlot(sc, features=names(known_markers_avgplot), cols=c("lightgrey", param$col), combine=FALSE, label=TRUE)
for (i in seq(known_markers_avgplot)) {
p_list[[i]] = p_list[[i]] + AddStyle(title=paste("Known marker genes:", names(known_markers_avgplot)[i]))
}
p = patchwork::wrap_plots(p_list, ncol=1)
print(p)
} else if ((known_markers_n > 0) & !any(idx_avgplot)) {
message("This tab is used to plot an average for 10 or more genes. All provided lists are shorter than this, and hence average feature plots are skipped.")
} else {
message("No known marker genes were provided and hence average feature plots are skipped.")
}× (Message)
This tab is used to plot an average for 10 or more genes. All provided lists are shorter than this, and hence average feature plots are skipped.
An individual feature plot colours single cells on the UMAP according to their normalised gene expression.
if ((known_markers_n > 0) & length(known_markers_vect) <= 100) {
p_list = Seurat::FeaturePlot(sc, features=known_markers_vect, cols=c("lightgrey", param$col), combine=FALSE, label=TRUE)
for (i in seq(p_list)) p_list[[i]] = p_list[[i]] + AddStyle()
p = patchwork::wrap_plots(p_list, ncol=2)
print(p)
} else if (length(known_markers_vect) > 100) {
message("This tab is used to plot up to 100 known marker genes. Your provided list is longer than this, and hence individual feature plots are skipped.")
} else {
message("No known marker genes were provided and hence individual feature plots are skipped.")
}We next identify genes that are differentially expressed in one cluster compared to all other clusters, based on raw RNA data and the method “MAST.” Resulting p-values are adjusted using the Bonferroni method. However, note that the p-values are likely inflated, since both clusters and marker genes were determined based on the same gene expression data, and there ought to be gene expression differences by design. Nevertheless, p-values can be used to sort and prioritize marker genes. We require marker genes to be expressed in at least 25% of cells in the respective cluster, with a minimum log2 fold change of 1 and adjusted p-value of at most 0.05. The names of differentially expressed genes per cluster, alongside statistical measures and additional gene annotation are written to file.
As described above, cell clusters approximate cell types and states. But how do we know which cell types these are? To characterize cell clusters, we identify marker genes. Good marker genes are genes that are particularly expressed in one cluster, and existing knowledge of these marker genes can be used to extrapolate biological functions for the cluster. A good clustering of cells typically results in good marker genes. Hence, if you cannot find good marker genes you may need to go back to the start of the workflow and adapt your parameters. Note that we also determine genes that are particularly down-regulated in one cluster, even if these are not marker genes in the classical sense.
Good marker genes are highly and possibly even only expressed in one cluster as compared to all other clusters. However, sometimes marker genes are also expressed in other clusters, or are declared as marker genes in these clusters, for example cell lineage markers that are shared by different cell subtypes. To evaluate marker genes, it is essential to visualize their expression patterns.
In addition to detecting marker genes, it might be informative to detect genes that are differentially expressed between one specific cluster and one or several other clusters. This approach allows a more specific distinction of individual clusters and investigation of more subtle differences, see the section “Differentially expressed genes” below.# Find DEGs for every cluster compared to all remaining cells, report positive (=markers) and negative ones
# min.pct = requires feature to be detected at this minimum percentage in either of the two groups of cells
# logfc.threshold = requires a feature to be differentially expressed on average by some amount between the two groups
# only.pos = find only positive markers
# Review recommends using "MAST"; Mathias uses "LR"
# ALWAYS USE: assay="RNA"/"Spatial" or assay="SCT"
# DONT USE: assay=integrated datasets; this data is normalised and contains only 2k genes
# Note: By default, the function uses slot="data". Mast requires log data, so this is the correct way to do it.
# https://www.bioconductor.org/packages/release/bioc/vignettes/MAST/inst/doc/MAST-interoperability.html
markers = suppressMessages(Seurat::FindAllMarkers(sc, assay=param$assay_raw, test.use="MAST",
only.pos=FALSE, min.pct=param$marker_pct, logfc.threshold=param$marker_log2FC,
latent.vars=param$latent_vars, verbose=FALSE, silent=TRUE))
# If no markers were found, initialise the degs table so that further downstream (export) chunks run
if (ncol(markers)==0) markers = DegsEmptyMarkerResultsTable(levels(sc$seurat_clusters))
# For Seurat versions until 3.2, log fold change is based on the natural log. Convert to log base 2.
if ("avg_logFC" %in% colnames(markers) & !"avg_log2FC" %in% colnames(markers)) {
lfc_idx = grep("avg_log\\S*FC", colnames(markers))
markers[,lfc_idx] = marker_deg_results[,lfc_idx] / log(2)
col_nms = colnames(markers)
col_nms[2] = "avg_log2FC"
colnames(markers) = col_nms
}
# Sort markers
markers = markers %>% DegsSort(group=c("cluster"))
# Filter markers
markers_filt = DegsFilter(markers, cut_log2FC=param$marker_log2FC, cut_padj=param$marker_padj)
markers_found = nrow(markers_filt$all)>0
# Add average data to table
markers_out = cbind(markers_filt$all, DegsAvgDataPerIdentity(sc, genes=markers_filt$all$gene, assay=param$assay_raw))
# Split by cluster and write to file
additional_readme = data.frame(Column=c("cluster",
"p_val_adj_score",
"avg_<assay>_<slot>_id<cluster>"),
Description=c("Cluster",
"Score calculated as follows: -log10(p_val_adj)*sign(avg_log2FC)",
"Average expression value for cluster; <assay>: RNA or SCT; <slot>: raw counts or normalised data"))
invisible(DegsWriteToFile(split(markers_out, markers_out$cluster),
annot_ensembl=annot_ensembl,
gene_to_ensembl=seurat_rowname_to_ensembl,
additional_readme=additional_readme,
file=file.path(param$path_out, "marker_degs", "markers_cluster_vs_rest.xlsx")))
# Plot number of differentially expressed genes
p = DegsPlotNumbers(markers_filt$all,
group="cluster",
title=paste0("Number of DEGs, comparing each cluster to the rest\n(FC=", 2^param$marker_log2FC, ", adj. p-value=", param$marker_padj, ")"))
# Add marker table to seurat object
Seurat::Misc(sc, "markers") = list(condition_column="seurat_clusters", test="MAST", padj=param$marker_padj,
log2FC=param$marker_log2FC, min_pct=param$marker_pct, assay=param$assay_raw, slot="data",
latent_vars=param$latent_vars,
results=markers_filt$all,
enrichr=EmptyEnrichrDf(overlap_split=TRUE))
# Add marker lists to seurat object
marker_genesets_up = split(markers_filt$up$gene, markers_filt$up$cluster)
names(marker_genesets_up) = paste0("markers_up_cluster", names(marker_genesets_up))
marker_genesets_down = split(markers_filt$down$gene, markers_filt$down$cluster)
names(marker_genesets_down) = paste0("markers_down_cluster", names(marker_genesets_down))
sc = ScAddLists(sc, lists=c(marker_genesets_up, marker_genesets_down), lists_slot="gene_lists")
if (markers_found) {
p
} else {
warning("No differentially expressed genes (cluster vs rest) found. The following related code is not executed, no related plots and tables are generated.")
}We use the term “marker genes” to specifically describe genes that are up-regulated in cells of one cluster compared to the rest.
if (markers_found) {
markers_top = DegsUpDisplayTop(markers_filt$up, n=5)
# Add labels
markers_top$labels = paste0(markers_top$cluster, ": ", markers_top$gene)
# Show table
knitr::kable(markers_top %>% dplyr::select(-labels), align="l", caption="Up to top 5 marker genes per cell cluster") %>%
kableExtra::kable_styling(bootstrap_options=c("striped", "hover")) %>%
kableExtra::scroll_box(width="100%", height="700px")
}| cluster | gene | avg_log2FC | p_val | p_val_adj | pct.1 | pct.2 |
|---|---|---|---|---|---|---|
| 1 | IL7R | 2.465 | 4.3e-25 | 6.2e-21 | 0.818 | 0.127 |
| 1 | LTB | 2.071 | 2.1e-22 | 3.0e-18 | 0.758 | 0.164 |
| 1 | RPL13A | 1.062 | 4.5e-14 | 6.5e-10 | 1.000 | 0.896 |
| 1 | CAMK4 | 1.121 | 3.4e-11 | 4.9e-07 | 0.591 | 0.119 |
| 1 | RPLP0 | 1.200 | 3.7e-11 | 5.3e-07 | 1.000 | 0.851 |
| 2 | NKG7 | 2.520 | 1.7e-28 | 2.4e-24 | 0.984 | 0.403 |
| 2 | GZMH | 2.384 | 5.4e-28 | 7.8e-24 | 0.934 | 0.144 |
| 2 | PRF1 | 2.409 | 5.1e-24 | 7.3e-20 | 0.951 | 0.273 |
| 2 | GZMB | 2.775 | 1.5e-23 | 2.2e-19 | 0.803 | 0.108 |
| 2 | FGFBP2 | 3.055 | 2.5e-23 | 3.6e-19 | 0.836 | 0.137 |
| 3 | TREML1 | 4.700 | 4.9e-11 | 7.0e-07 | 0.283 | 0.000 |
| 3 | MAP3K7CL | 4.482 | 4.0e-10 | 5.7e-06 | 0.261 | 0.052 |
| 3 | PPBP | 5.728 | 5.4e-10 | 7.8e-06 | 0.261 | 0.000 |
| 3 | GNG11 | 4.524 | 5.4e-10 | 7.8e-06 | 0.261 | 0.000 |
| 3 | SPARC | 4.894 | 7.7e-10 | 1.1e-05 | 0.283 | 0.013 |
| 4 | FCN1 | 5.198 | 1.1e-34 | 1.5e-30 | 0.926 | 0.064 |
| 4 | LYZ | 6.190 | 6.3e-34 | 9.0e-30 | 1.000 | 0.098 |
| 4 | S100A9 | 5.475 | 2.2e-29 | 3.2e-25 | 0.963 | 0.017 |
| 4 | CTSS | 4.004 | 2.7e-29 | 3.8e-25 | 1.000 | 0.341 |
| 4 | CLEC7A | 2.536 | 8.8e-29 | 1.3e-24 | 1.000 | 0.046 |
The following plots visualise the top marker genes for each cluster, respectively. Clear marker genes indicate good clusters that represent cell types.
# Note: We need to run this chunk as it specifies a variable that is used in chunk definitions below
if (markers_found) {
# The height of feature and violin plots is independent of the number of clusters
# The height of dotplots is dependent on the number of clusters
height_per_row = 3
height_per_row_variable = max(2, 0.3 * length(levels(sc$seurat_clusters)))
# Feature plots and violin plots: each row contains 5 plots
# The plot has 5 columns and 1 row per cluster, hence the layout works nicely if we find
# at least 5 markers per cluster
nr_rows_5cols = ceiling(nrow(markers_top)/5)
fig_height_5cols = height_per_row * nr_rows_5cols
# Dotplots: each row contains 2 plots
nr_rows_dp_2cols = ceiling(length(levels(sc$seurat_clusters))/2)
fig_height_dp_2cols = height_per_row_variable * nr_rows_dp_2cols %>% min(100)
} else {
fig_height_5cols = fig_height_dp_2cols = 7
}if (markers_found) {
# Plot each marker one by one, and then combine them all at the end
p_list = list()
for (i in 1:nrow(markers_top)) {
p_list[[i]] = Seurat::FeaturePlot(sc, features=markers_top$gene[i],
cols=c("lightgrey", param$col_clusters[markers_top$cluster[i]]),
combine=TRUE, label=TRUE) +
AddStyle(title=markers_top$labels[i],
xlab="", ylab="",
legend_position="bottom")
}
# Combine all plots
p = patchwork::wrap_plots(p_list, ncol=5) +
patchwork::plot_annotation(title="UMAP, cells coloured by normalised gene expression data, top marker genes per cluster")
p
}if (markers_found) {
# Plot violin plots per marker gene, and combine it all at the end
# This layout works out nicely if there are 5 marker genes per cluster
p_list = list()
for(i in 1:nrow(markers_top)) {
p_list[[i]] = Seurat::VlnPlot(sc, features=markers_top$gene[i], assay=param$assay_raw, pt.size=0, cols=param$col_clusters) +
AddStyle(title=markers_top$labels[i], xlab="")
}
p = patchwork::wrap_plots(p_list, ncol=5) +
patchwork::plot_annotation(title="Violin plot of for normalised gene expression data, top marker genes per cluster") & theme(legend.position="none")
p
}if (markers_found) {
# Visualises how feature expression changes across different clusters
# Plot dotplots per cluster, and combine it all at the end
p_list = lapply(markers_top$cluster %>% sort() %>% unique(), function(cl) {
genes = markers_top %>% dplyr::filter(cluster==cl) %>% dplyr::pull(gene)
p = suppressMessages(Seurat::DotPlot(sc, features=genes) +
scale_colour_gradient2(low="steelblue", mid="lightgrey", high="darkgoldenrod1") +
AddStyle(title=paste0("Top marker genes for cluster ", cl, " (scaled)"), ylab="Cluster", legend_position="bottom") +
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5)) +
guides(size=guide_legend(order=1)))
return(p)
})
p = patchwork::wrap_plots(p_list, ncol=2)
p
}if (markers_found) {
# Visualises how feature expression changes across different clusters
# Plot dotplots per cluster, and combine it all at the end
p_list = lapply(markers_top$cluster %>% sort() %>% unique(), function(cl) {
genes = markers_top %>% dplyr::filter(cluster==cl) %>% dplyr::pull(gene)
genes = genes[length(genes):1]
p = suppressMessages(DotPlotUpdated(sc, features=genes, scale=FALSE, cols=c("lightgrey", param$col)) +
AddStyle(title=paste0("Top marker genes for cluster ", cl, " (not scaled)"), ylab="Cluster", legend_position="bottom") +
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5)) +
guides(size=guide_legend(order=1)))
return(p)
})
p = patchwork::wrap_plots(p_list, ncol=2)
p
}If the dataset contains multiple samples, we can visualise the expression of a gene that is up-regulated in a cluster separately for each sample. For each cluster, we extract up-regulated genes, and visualise expression of these genes in all cells in that cluster, split by their sample of origin.
fig_height_degs_per_cl = max(5,
max(2, 0.3 * (sc$orig.ident %>% unique() %>% length())) * length(levels(sc$seurat_clusters)) * 2) # We multiply by 2, as the legend requires quite some spaceFirst, we plot scaled expression as explained above (see section Known marker genes). This plot allows us to judge whether the expression of a gene is increased in one sample as compared to the other samples.
if (markers_found) {
p_list = list()
markers_filt_up_top = DegsUpDisplayTop(degs=markers_filt$up, n=50)
for (cl in levels(sc$seurat_clusters)) {
markers_filt_up_cl_top = markers_filt_up_top %>%
dplyr::filter(cluster==cl) %>%
dplyr::pull(gene)
if (length(markers_filt_up_cl_top) > 0) {
p_list[[cl]] = suppressMessages(Seurat::DotPlot(sc, features=markers_filt_up_cl_top, idents=cl, group.by="orig.ident") +
scale_colour_gradient2(low="steelblue", mid="lightgrey", high="darkgoldenrod1") +
AddStyle(title=paste0("Up to 50 markers (up-regulated genes) for cluster ", cl), ylab="Cluster", legend_position="bottom") +
theme(axis.text.x=element_text(angle=90, hjust=1, vjust=.5)) +
guides(size=guide_legend(order=1)))
}
}
p = patchwork::wrap_plots(p_list, ncol=1) + patchwork::plot_annotation("Dotplot per cluster")
p
}Second, we plot normalised expression with no further scaling. This plot helps to get an impression of the total expression of a gene.
if (markers_found) {
n_genes_max_dotplot = 50
p_list = list()
for (cl in levels(sc$seurat_clusters)) {
markers_filt_up_cl_top = markers_filt$up %>%
dplyr::filter(cluster==cl) %>%
dplyr::top_n(n=n_genes_max_dotplot, wt=p_val_adj_score) %>%
dplyr::pull(gene)
if (length(markers_filt_up_cl_top) > 0) {
p_list[[cl]] = DotPlotUpdated(sc, features=markers_filt_up_cl_top, idents=cl, group.by="orig.ident", scale=FALSE, cols=c("lightgrey", param$col)) +
AddStyle(title=paste0("Up to ", n_genes_max_dotplot, " markers (up-regulated genes) for cluster ", cl, " (not scaled)"), ylab="Cluster", legend_position="bottom") +
theme(axis.text.x=element_text(angle=90, hjust=1, vjust=.5)) +
guides(size=guide_legend(order=1))
}
}
p = patchwork::wrap_plots(p_list, ncol=1) + patchwork::plot_annotation("Dotplot per cluster (not scaled)")
p
}if (markers_found) {
# This will sample 500 cells; the number of cells per seurat_cluster will be proportional
cells_subset = ScSampleCells(sc, n=500, group="seurat_clusters", group_proportional=TRUE, seed=1)
# Heatmap of all differentially expressed genes
p = Seurat::DoHeatmap(sc, features=markers_filt$all$gene, group.colors=param$col_clusters, label=FALSE, cells=cells_subset) +
NoLegend() +
theme(axis.text.y=element_blank()) +
ggtitle("Heatmap of scaled gene expression data, all genes differentially expressed between a cluster and the rest")
p
}if (markers_found) {
# This will sample 500 cells; the number of cells per seurat_cluster will be proportional
cells_subset = ScSampleCells(sc, n=500, group="seurat_clusters", group_proportional=TRUE, seed=1)
# With fig.height = 20, 300 features can be shown; distribute among clusters
features_per_group = 300/length(levels(markers_filt$up$cluster))
features_subset = markers_filt$up %>%
dplyr::group_by(cluster) %>%
dplyr::top_n(n=features_per_group, wt=avg_log2FC) %>%
dplyr::arrange(cluster, -avg_log2FC) %>%
dplyr::pull(gene) %>%
unique()
# Heatmap of top up-regulated genes
p = Seurat::DoHeatmap(sc, features=features_subset, group.colors=param$col_clusters, label=FALSE, cells=cells_subset) +
NoLegend() +
theme(axis.text.y=element_text(size=8)) +
ggtitle("Heatmap of scaled gene expression data, top genes up-regulated in a cluster compared to the rest")
p
}if (markers_found) {
# This will sample 500 cells; the number of cells per seurat_cluster will be proportional
cells_subset = ScSampleCells(sc, n=500, group="seurat_clusters", group_proportional=TRUE, seed=1)
# With fig.height = 20, 300 features can be shown; distribute among clusters
features_per_group = 300/length(levels(markers_filt$down$cluster))
features_subset = markers_filt$down %>%
dplyr::group_by(cluster) %>%
dplyr::top_n(n=features_per_group, wt=-avg_log2FC) %>%
dplyr::arrange(cluster, avg_log2FC) %>%
dplyr::pull(gene) %>%
unique()
# Heatmap of top down-regulated genes
p = Seurat::DoHeatmap(sc, features=features_subset, group.colors=param$col_clusters, label=FALSE, cells=cells_subset) +
NoLegend() +
theme(axis.text.y=element_text(size=8)) +
ggtitle("Heatmap of scaled gene expression data, top genes up-regulated in a cluster compared to the rest")
p
}To gain first insights into potential functions of cells in a cluster, we test for over-representation of functional terms amongst up- and down-regulated genes of each cluster. Over-represented terms are written to file.
We first translate gene symbols of up- and down-regulated genes per cluster into Entrez gene symbols, and then use the “enrichR” R-package to access the “Enrichr” website https://amp.pharm.mssm.edu/Enrichr/. You can choose to test functional enrichment from a wide range of databases:
# Set Enrichr database
enrichR::setEnrichrSite(param$enrichr_site)× (Message)
Connection changed to https://maayanlab.cloud/Enrichr/
× (Message)
Connection is Live!
# List databases
dbs_all = enrichR::listEnrichrDbs()
knitr::kable(dbs_all, align="l", caption="Enrichr databases") %>%
kableExtra::kable_styling(bootstrap_options=c("striped", "hover")) %>%
kableExtra::scroll_box(width="100%", height="300px")| geneCoverage | genesPerTerm | libraryName | link | numTerms | appyter | categoryId |
|---|---|---|---|---|---|---|
| 13362 | 275 | Genome_Browser_PWMs | http://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/ | 615 | ea115789fcbf12797fd692cec6df0ab4dbc79c6a | 1 |
| 27884 | 1284 | TRANSFAC_and_JASPAR_PWMs | http://jaspar.genereg.net/html/DOWNLOAD/ | 326 | 7d42eb43a64a4e3b20d721fc7148f685b53b6b30 | 1 |
| 6002 | 77 | Transcription_Factor_PPIs | 290 | 849f222220618e2599d925b6b51868cf1dab3763 | 1 | |
| 47172 | 1370 | ChEA_2013 | http://amp.pharm.mssm.edu/lib/cheadownload.jsp | 353 | 7ebe772afb55b63b41b79dd8d06ea0fdd9fa2630 | 7 |
| 47107 | 509 | Drug_Perturbations_from_GEO_2014 | http://www.ncbi.nlm.nih.gov/geo/ | 701 | ad270a6876534b7cb063e004289dcd4d3164f342 | 7 |
| 21493 | 3713 | ENCODE_TF_ChIP-seq_2014 | http://genome.ucsc.edu/ENCODE/downloads.html | 498 | 497787ebc418d308045efb63b8586f10c526af51 | 7 |
| 1295 | 18 | BioCarta_2013 | https://cgap.nci.nih.gov/Pathways/BioCarta_Pathways | 249 | 4a293326037a5229aedb1ad7b2867283573d8bcd | 7 |
| 3185 | 73 | Reactome_2013 | http://www.reactome.org/download/index.html | 78 | b343994a1b68483b0122b08650201c9b313d5c66 | 7 |
| 2854 | 34 | WikiPathways_2013 | http://www.wikipathways.org/index.php/Download_Pathways | 199 | 5c307674c8b97e098f8399c92f451c0ff21cbf68 | 7 |
| 15057 | 300 | Disease_Signatures_from_GEO_up_2014 | http://www.ncbi.nlm.nih.gov/geo/ | 142 | 248c4ed8ea28352795190214713c86a39fd7afab | 7 |
| 4128 | 48 | KEGG_2013 | http://www.kegg.jp/kegg/download/ | 200 | eb26f55d3904cb0ea471998b6a932a9bf65d8e50 | 7 |
| 34061 | 641 | TF-LOF_Expression_from_GEO | http://www.ncbi.nlm.nih.gov/geo/ | 269 | 1 | |
| 7504 | 155 | TargetScan_microRNA | http://www.targetscan.org/cgi-bin/targetscan/data_download.cgi?db=vert_61 | 222 | f4029bf6a62c91ab29401348e51df23b8c44c90f | 7 |
| 16399 | 247 | PPI_Hub_Proteins | http://amp.pharm.mssm.edu/X2K | 385 | 69c0cfe07d86f230a7ef01b365abcc7f6e52f138 | 2 |
| 12753 | 57 | GO_Molecular_Function_2015 | http://www.geneontology.org/GO.downloads.annotations.shtml | 1136 | f531ac2b6acdf7587a54b79b465a5f4aab8f00f9 | 7 |
| 23726 | 127 | GeneSigDB | https://pubmed.ncbi.nlm.nih.gov/22110038/ | 2139 | 6d655e0aa3408a7accb3311fbda9b108681a8486 | 4 |
| 32740 | 85 | Chromosome_Location | http://software.broadinstitute.org/gsea/msigdb/index.jsp | 386 | 8dab0f96078977223646ff63eb6187e0813f1433 | 7 |
| 13373 | 258 | Human_Gene_Atlas | http://biogps.org/downloads/ | 84 | 0741451470203d7c40a06274442f25f74b345c9c | 5 |
| 19270 | 388 | Mouse_Gene_Atlas | http://biogps.org/downloads/ | 96 | 31191bfadded5f96983f93b2a113cf2110ff5ddb | 5 |
| 13236 | 82 | GO_Cellular_Component_2015 | http://www.geneontology.org/GO.downloads.annotations.shtml | 641 | e1d004d5797cbd2363ef54b1c3b361adb68795c6 | 7 |
| 14264 | 58 | GO_Biological_Process_2015 | http://www.geneontology.org/GO.downloads.annotations.shtml | 5192 | bf120b6e11242b1a64c80910d8e89f87e618e235 | 7 |
| 3096 | 31 | Human_Phenotype_Ontology | http://www.human-phenotype-ontology.org/ | 1779 | 17a138b0b70aa0e143fe63c14f82afb70bc3ed0a | 3 |
| 22288 | 4368 | Epigenomics_Roadmap_HM_ChIP-seq | http://www.roadmapepigenomics.org/ | 383 | e1bc8a398e9b21f9675fb11bef18087eda21b1bf | 1 |
| 4533 | 37 | KEA_2013 | http://amp.pharm.mssm.edu/lib/keacommandline.jsp | 474 | 462045609440fa1e628a75716b81a1baa5bd9145 | 7 |
| 10231 | 158 | NURSA_Human_Endogenous_Complexome | https://www.nursa.org/nursa/index.jsf | 1796 | 7d3566b12ebc23dd23d9ca9bb97650f826377b16 | 2 |
| 2741 | 5 | CORUM | http://mips.helmholtz-muenchen.de/genre/proj/corum/ | 1658 | d047f6ead7831b00566d5da7a3b027ed9196e104 | 2 |
| 5655 | 342 | SILAC_Phosphoproteomics | http://amp.pharm.mssm.edu/lib/keacommandline.jsp | 84 | 54dcd9438b33301deb219866e162b0f9da7e63a0 | 2 |
| 10406 | 715 | MGI_Mammalian_Phenotype_Level_3 | http://www.informatics.jax.org/ | 71 | c3bfc90796cfca8f60cba830642a728e23a53565 | 7 |
| 10493 | 200 | MGI_Mammalian_Phenotype_Level_4 | http://www.informatics.jax.org/ | 476 | 0b09a9a1aa0af4fc7ea22d34a9ae644d45864bd6 | 7 |
| 11251 | 100 | Old_CMAP_up | http://www.broadinstitute.org/cmap/ | 6100 | 9041f90cccbc18479138330228b336265e09021c | 4 |
| 8695 | 100 | Old_CMAP_down | http://www.broadinstitute.org/cmap/ | 6100 | ebc0d905b3b3142f936d400c5f2a4ff926c81c37 | 4 |
| 1759 | 25 | OMIM_Disease | http://www.omim.org/downloads | 90 | cb2b92578a91e023d0498a334923ee84add34eca | 4 |
| 2178 | 89 | OMIM_Expanded | http://www.omim.org/downloads | 187 | 27eca242904d8e12a38cf8881395bc50d57a03e1 | 4 |
| 851 | 15 | VirusMINT | http://mint.bio.uniroma2.it/download.html | 85 | 5abad1fc36216222b0420cadcd9be805a0dda63e | 4 |
| 10061 | 106 | MSigDB_Computational | http://www.broadinstitute.org/gsea/msigdb/collections.jsp | 858 | e4cdcc7e259788fdf9b25586cce3403255637064 | 4 |
| 11250 | 166 | MSigDB_Oncogenic_Signatures | http://www.broadinstitute.org/gsea/msigdb/collections.jsp | 189 | c76f5319c33c4833c71db86a30d7e33cd63ff8cf | 4 |
| 15406 | 300 | Disease_Signatures_from_GEO_down_2014 | http://www.ncbi.nlm.nih.gov/geo/ | 142 | aabdf7017ae55ae75a004270924bcd336653b986 | 7 |
| 17711 | 300 | Virus_Perturbations_from_GEO_up | http://www.ncbi.nlm.nih.gov/geo/ | 323 | 45268b7fc680d05dd9a29743c2f2b2840a7620bf | 4 |
| 17576 | 300 | Virus_Perturbations_from_GEO_down | http://www.ncbi.nlm.nih.gov/geo/ | 323 | 5f531580ccd168ee4acc18b02c6bdf8200e19d08 | 4 |
| 15797 | 176 | Cancer_Cell_Line_Encyclopedia | https://portals.broadinstitute.org/ccle/home | 967 | eb38dbc3fb20adafa9d6f9f0b0e36f378e75284f | 5 |
| 12232 | 343 | NCI-60_Cancer_Cell_Lines | http://biogps.org/downloads/ | 93 | 75c81676d8d6d99d262c9660edc024b78cfb07c9 | 5 |
| 13572 | 301 | Tissue_Protein_Expression_from_ProteomicsDB | https://www.proteomicsdb.org/ | 207 | 7 | |
| 6454 | 301 | Tissue_Protein_Expression_from_Human_Proteome_Map | http://www.humanproteomemap.org/index.php | 30 | 49351dc989f9e6ca97c55f8aca7778aa3bfb84b9 | 5 |
| 3723 | 47 | HMDB_Metabolites | http://www.hmdb.ca/downloads | 3906 | 1905132115d22e4119bce543bdacaab074edb363 | 6 |
| 7588 | 35 | Pfam_InterPro_Domains | ftp://ftp.ebi.ac.uk/pub/databases/interpro/ | 311 | e2b4912cfb799b70d87977808c54501544e4cdc9 | 6 |
| 7682 | 78 | GO_Biological_Process_2013 | http://www.geneontology.org/GO.downloads.annotations.shtml | 941 | 5216d1ade194ffa5a6c00f105e2b1899f64f45fe | 7 |
| 7324 | 172 | GO_Cellular_Component_2013 | http://www.geneontology.org/GO.downloads.annotations.shtml | 205 | fd1332a42395e0bc1dba82868b39be7983a48cc5 | 7 |
| 8469 | 122 | GO_Molecular_Function_2013 | http://www.geneontology.org/GO.downloads.annotations.shtml | 402 | 7e3e99e5aae02437f80b0697b197113ce3209ab0 | 7 |
| 13121 | 305 | Allen_Brain_Atlas_up | http://www.brain-map.org/ | 2192 | 3804715a63a308570e47aa1a7877f01147ca6202 | 5 |
| 26382 | 1811 | ENCODE_TF_ChIP-seq_2015 | http://genome.ucsc.edu/ENCODE/downloads.html | 816 | 56b6adb4dc8a2f540357ef992d6cd93dfa2907e5 | 1 |
| 29065 | 2123 | ENCODE_Histone_Modifications_2015 | http://genome.ucsc.edu/ENCODE/downloads.html | 412 | 55b56cd8cf2ff04b26a09b9f92904008b82f3a6f | 1 |
| 280 | 9 | Phosphatase_Substrates_from_DEPOD | http://www.koehn.embl.de/depod/ | 59 | d40701e21092b999f4720d1d2b644dd0257b6259 | 2 |
| 13877 | 304 | Allen_Brain_Atlas_down | http://www.brain-map.org/ | 2192 | ea67371adec290599ddf484ced2658cfae259304 | 5 |
| 15852 | 912 | ENCODE_Histone_Modifications_2013 | http://genome.ucsc.edu/ENCODE/downloads.html | 109 | c209ae527bc8e98e4ccd27a668d36cd2c80b35b4 | 7 |
| 4320 | 129 | Achilles_fitness_increase | http://www.broadinstitute.org/achilles | 216 | 98366496a75f163164106e72439fb2bf2f77de4e | 4 |
| 4271 | 128 | Achilles_fitness_decrease | http://www.broadinstitute.org/achilles | 216 | 83a710c1ff67fd6b8af0d80fa6148c40dbd9bc64 | 4 |
| 10496 | 201 | MGI_Mammalian_Phenotype_2013 | http://www.informatics.jax.org/ | 476 | a4c6e217a81a4a58ff5a1c9fc102b70beab298e9 | 7 |
| 1678 | 21 | BioCarta_2015 | https://cgap.nci.nih.gov/Pathways/BioCarta_Pathways | 239 | 70e4eb538daa7688691acfe5d9c3c19022be832b | 7 |
| 756 | 12 | HumanCyc_2015 | http://humancyc.org/ | 125 | 711f0c02b23f5e02a01207174943cfeee9d3ea9c | 7 |
| 3800 | 48 | KEGG_2015 | http://www.kegg.jp/kegg/download/ | 179 | e80d25c56de53c704791ddfdc6ab5eec28ae7243 | 7 |
| 2541 | 39 | NCI-Nature_2015 | http://pid.nci.nih.gov/ | 209 | 47edfc012bcbb368a10b717d8dca103f7814b5a4 | 7 |
| 1918 | 39 | Panther_2015 | http://www.pantherdb.org/ | 104 | ab824aeeff0712bab61f372e43aebb870d1677a9 | 7 |
| 5863 | 51 | WikiPathways_2015 | http://www.wikipathways.org/index.php/Download_Pathways | 404 | 1f7eea2f339f37856522c1f1c70ec74c7b25325f | 7 |
| 6768 | 47 | Reactome_2015 | http://www.reactome.org/download/index.html | 1389 | 36e541bee015eddb8d53827579549e30fe7a3286 | 7 |
| 25651 | 807 | ESCAPE | http://www.maayanlab.net/ESCAPE/ | 315 | a7acc741440264717ff77751a7e5fed723307835 | 5 |
| 19129 | 1594 | HomoloGene | http://www.ncbi.nlm.nih.gov/homologene | 12 | 663b665b75a804ef98add689f838b68e612f0d2a | 6 |
| 23939 | 293 | Disease_Perturbations_from_GEO_down | http://www.ncbi.nlm.nih.gov/geo/ | 839 | 0f412e0802d76efa0374504c2c9f5e0624ff7f09 | 8 |
| 23561 | 307 | Disease_Perturbations_from_GEO_up | http://www.ncbi.nlm.nih.gov/geo/ | 839 | 9ddc3902fb01fb9eaf1a2a7c2ff3acacbb48d37e | 8 |
| 23877 | 302 | Drug_Perturbations_from_GEO_down | http://www.ncbi.nlm.nih.gov/geo/ | 906 | 068623a05ecef3e4a5e0b4f8db64bb8faa3c897f | 8 |
| 15886 | 9 | Genes_Associated_with_NIH_Grants | https://grants.nih.gov/grants/oer.htm | 32876 | 76fc5ec6735130e287e62bae6770a3c5ee068645 | 6 |
| 24350 | 299 | Drug_Perturbations_from_GEO_up | http://www.ncbi.nlm.nih.gov/geo/ | 906 | c9c2155b5ac81ac496854fa61ba566dcae06cc80 | 8 |
| 3102 | 25 | KEA_2015 | http://amp.pharm.mssm.edu/Enrichr | 428 | 18a081774e6e0aaf60b1a4be7fd20afcf9e08399 | 2 |
| 31132 | 298 | Gene_Perturbations_from_GEO_up | http://www.ncbi.nlm.nih.gov/geo/ | 2460 | 53dedc29ce3100930d68e506f941ef59de05dc6b | 8 |
| 30832 | 302 | Gene_Perturbations_from_GEO_down | http://www.ncbi.nlm.nih.gov/geo/ | 2460 | 499882af09c62dd6da545c15cb51c1dc5e234f78 | 8 |
| 48230 | 1429 | ChEA_2015 | http://amp.pharm.mssm.edu/Enrichr | 395 | 712eb7b6edab04658df153605ec6079fa89fb5c7 | 7 |
| 5613 | 36 | dbGaP | http://www.ncbi.nlm.nih.gov/gap | 345 | 010f1267055b1a1cb036e560ea525911c007a666 | 4 |
| 9559 | 73 | LINCS_L1000_Chem_Pert_up | https://clue.io/ | 33132 | 5e678b3debe8d8ea95187d0cd35c914017af5eb3 | 4 |
| 9448 | 63 | LINCS_L1000_Chem_Pert_down | https://clue.io/ | 33132 | fedbf5e221f45ee60ebd944f92569b5eda7f2330 | 4 |
| 16725 | 1443 | GTEx_Tissue_Expression_Down | http://www.gtexportal.org/ | 2918 | 74b818bd299a9c42c1750ffe43616aa9f7929f02 | 5 |
| 19249 | 1443 | GTEx_Tissue_Expression_Up | http://www.gtexportal.org/ | 2918 | 103738763d89cae894bec9f145ac28167a90e611 | 5 |
| 15090 | 282 | Ligand_Perturbations_from_GEO_down | http://www.ncbi.nlm.nih.gov/geo/ | 261 | 1eb3c0426140340527155fd0ef67029db2a72191 | 8 |
| 16129 | 292 | Aging_Perturbations_from_GEO_down | http://www.ncbi.nlm.nih.gov/geo/ | 286 | cd95fe1b505ba6f28cd722cfba50fdea979d3b4c | 8 |
| 15309 | 308 | Aging_Perturbations_from_GEO_up | http://www.ncbi.nlm.nih.gov/geo/ | 286 | 74c4f0a0447777005b2a5c00c9882a56dfc62d7c | 8 |
| 15103 | 318 | Ligand_Perturbations_from_GEO_up | http://www.ncbi.nlm.nih.gov/geo/ | 261 | 31baa39da2931ddd5f7aedf2d0bbba77d2ba7b46 | 8 |
| 15022 | 290 | MCF7_Perturbations_from_GEO_down | http://www.ncbi.nlm.nih.gov/geo/ | 401 | 555f68aef0a29a67b614a0d7e20b6303df9069c6 | 8 |
| 15676 | 310 | MCF7_Perturbations_from_GEO_up | http://www.ncbi.nlm.nih.gov/geo/ | 401 | 1bc2ba607f1ff0dda44e2a15f32a2c04767da18c | 8 |
| 15854 | 279 | Microbe_Perturbations_from_GEO_down | http://www.ncbi.nlm.nih.gov/geo/ | 312 | 9e613dba78ef7e60676b13493a9dc49ccd3c8b3f | 8 |
| 15015 | 321 | Microbe_Perturbations_from_GEO_up | http://www.ncbi.nlm.nih.gov/geo/ | 312 | d0c3e2a68e8c611c669098df2c87b530cec3e132 | 8 |
| 3788 | 159 | LINCS_L1000_Ligand_Perturbations_down | https://clue.io/ | 96 | 957846cb05ef31fc8514120516b73cc65af7980e | 4 |
| 3357 | 153 | LINCS_L1000_Ligand_Perturbations_up | https://clue.io/ | 96 | 3bd494146c98d8189898a947f5ef5710f1b7c4b2 | 4 |
| 12668 | 300 | L1000_Kinase_and_GPCR_Perturbations_down | https://clue.io/ | 3644 | 1ccc5bce553e0c2279f8e3f4ddcfbabcf566623b | 2 |
| 12638 | 300 | L1000_Kinase_and_GPCR_Perturbations_up | https://clue.io/ | 3644 | b54a0d4ba525eac4055c7314ca9d9312adcb220c | 2 |
| 8973 | 64 | Reactome_2016 | http://www.reactome.org/download/index.html | 1530 | 1f54638e8f45075fb79489f0e0ef906594cb0678 | 2 |
| 7010 | 87 | KEGG_2016 | http://www.kegg.jp/kegg/download/ | 293 | 43f56da7540195ba3c94eb6e34c522a699b36da9 | 7 |
| 5966 | 51 | WikiPathways_2016 | http://www.wikipathways.org/index.php/Download_Pathways | 437 | 340be98b444cad50bb974df69018fd598e23e5e1 | 7 |
| 15562 | 887 | ENCODE_and_ChEA_Consensus_TFs_from_ChIP-X | 104 | 5426f7747965c23ef32cff46fabf906e2cd76bfa | 1 | |
| 17850 | 300 | Kinase_Perturbations_from_GEO_down | http://www.ncbi.nlm.nih.gov/geo/ | 285 | bb9682d78b8fc43be842455e076166fcd02cefc3 | 2 |
| 17660 | 300 | Kinase_Perturbations_from_GEO_up | http://www.ncbi.nlm.nih.gov/geo/ | 285 | 78618915009cac3a0663d6f99d359e39a31b6660 | 2 |
| 1348 | 19 | BioCarta_2016 | http://cgap.nci.nih.gov/Pathways/BioCarta_Pathways | 237 | 13d9ab18921d5314a5b2b366f6142b78ab0ff6aa | 2 |
| 934 | 13 | HumanCyc_2016 | http://humancyc.org/ | 152 | d6a502ef9b4c789ed5e73ca5a8de372796e5c72a | 2 |
| 2541 | 39 | NCI-Nature_2016 | http://pid.nci.nih.gov/ | 209 | 3c1e1f7d1a651d9aaa198e73704030716fc09431 | 2 |
| 2041 | 42 | Panther_2016 | http://www.pantherdb.org/pathway/ | 112 | ca5f6abf7f75d9baae03396e84d07300bf1fd051 | 2 |
| 5209 | 300 | DrugMatrix | https://ntp.niehs.nih.gov/drugmatrix/ | 7876 | 255c3db820d612f34310f22a6985dad50e9fe1fe | 4 |
| 49238 | 1550 | ChEA_2016 | http://amp.pharm.mssm.edu/Enrichr | 645 | af271913344aa08e6a755af1d433ef15768d749a | 1 |
| 2243 | 19 | huMAP | http://proteincomplexes.org/ | 995 | 249247d2f686d3eb4b9e4eb976c51159fac80a89 | 2 |
| 19586 | 545 | Jensen_TISSUES | http://tissues.jensenlab.org/ | 1842 | e8879ab9534794721614d78fe2883e9e564d7759 | 3 |
| 22440 | 505 | RNA-Seq_Disease_Gene_and_Drug_Signatures_from_GEO | http://www.ncbi.nlm.nih.gov/geo/ | 1302 | f0752e4d7f5198f86446678966b260c530d19d78 | 8 |
| 8184 | 24 | MGI_Mammalian_Phenotype_2017 | http://www.informatics.jax.org/ | 5231 | 0705e59bff98deda6e9cbe00cfcdd871c85e7d04 | 7 |
| 18329 | 161 | Jensen_COMPARTMENTS | http://compartments.jensenlab.org/ | 2283 | 56ec68c32d4e83edc2ee83bea0e9f6a3829b2279 | 3 |
| 15755 | 28 | Jensen_DISEASES | http://diseases.jensenlab.org/ | 1811 | 3045dff8181367c1421627bb8e4c5a32c6d67f98 | 3 |
| 10271 | 22 | BioPlex_2017 | http://bioplex.hms.harvard.edu/ | 3915 | b8620b1a9d0d271d1a2747d8cfc63589dba39991 | 2 |
| 10427 | 38 | GO_Cellular_Component_2017 | http://www.geneontology.org/ | 636 | 8fed21d22dfcc3015c05b31d942fdfc851cc8e04 | 7 |
| 10601 | 25 | GO_Molecular_Function_2017 | http://www.geneontology.org/ | 972 | b4018906e0a8b4e81a1b1afc51e0a2e7655403eb | 7 |
| 13822 | 21 | GO_Biological_Process_2017 | http://www.geneontology.org/ | 3166 | d9da4dba4a3eb84d4a28a3835c06dfbbe5811f92 | 7 |
| 8002 | 143 | GO_Cellular_Component_2017b | http://www.geneontology.org/ | 816 | ecf39c41fa5bc7deb625a2b5761a708676e9db7c | 7 |
| 10089 | 45 | GO_Molecular_Function_2017b | http://www.geneontology.org/ | 3271 | 8d8340361dd36a458f1f0a401f1a3141de1f3200 | 7 |
| 13247 | 49 | GO_Biological_Process_2017b | http://www.geneontology.org/ | 10125 | 6404c38bffc2b3732de4e3fbe417b5043009fe34 | 7 |
| 21809 | 2316 | ARCHS4_Tissues | http://amp.pharm.mssm.edu/archs4 | 108 | 4126374338235650ab158ba2c61cd2e2383b70df | 5 |
| 23601 | 2395 | ARCHS4_Cell-lines | http://amp.pharm.mssm.edu/archs4 | 125 | 5496ef9c9ae9429184d0b9485c23ba468ee522a8 | 5 |
| 20883 | 299 | ARCHS4_IDG_Coexp | http://amp.pharm.mssm.edu/archs4 | 352 | ce60be284fdd5a9fc6240a355421a9e12b1ee84a | 4 |
| 19612 | 299 | ARCHS4_Kinases_Coexp | http://amp.pharm.mssm.edu/archs4 | 498 | 6721c5ed97b7772e4a19fdc3f797110df0164b75 | 2 |
| 25983 | 299 | ARCHS4_TFs_Coexp | http://amp.pharm.mssm.edu/archs4 | 1724 | 8a468c3ae29fa68724f744cbef018f4f3b61c5ab | 1 |
| 19500 | 137 | SysMyo_Muscle_Gene_Sets | http://sys-myo.rhcloud.com/ | 1135 | 8 | |
| 14893 | 128 | miRTarBase_2017 | http://mirtarbase.mbc.nctu.edu.tw/ | 3240 | 6b7c7fe2a97b19aecbfba12d8644af6875ad99c4 | 1 |
| 17598 | 1208 | TargetScan_microRNA_2017 | http://www.targetscan.org/ | 683 | 79d13fb03d2fa6403f9be45c90eeda0f6822e269 | 1 |
| 5902 | 109 | Enrichr_Libraries_Most_Popular_Genes | http://amp.pharm.mssm.edu/Enrichr | 121 | e9b7d8ee237d0a690bd79d970a23a9fa849901ed | 6 |
| 12486 | 299 | Enrichr_Submissions_TF-Gene_Coocurrence | http://amp.pharm.mssm.edu/Enrichr | 1722 | be2ca8ef5a8c8e17d7e7bd290e7cbfe0951396c0 | 1 |
| 1073 | 100 | Data_Acquisition_Method_Most_Popular_Genes | http://amp.pharm.mssm.edu/Enrichr | 12 | 17ce5192b9eba7d109b6d228772ea8ab222e01ef | 6 |
| 19513 | 117 | DSigDB | http://tanlab.ucdenver.edu/DSigDB/DSigDBv1.0/ | 4026 | 287476538ab98337dbe727b3985a436feb6d192a | 4 |
| 14433 | 36 | GO_Biological_Process_2018 | http://www.geneontology.org/ | 5103 | b5b77681c46ac58cd050e60bcd4ad5041a9ab0a9 | 7 |
| 8655 | 61 | GO_Cellular_Component_2018 | http://www.geneontology.org/ | 446 | e9ebe46188efacbe1056d82987ff1c70218fa7ae | 7 |
| 11459 | 39 | GO_Molecular_Function_2018 | http://www.geneontology.org/ | 1151 | 79ff80ae9a69dd00796e52569e41422466fa0bee | 7 |
| 19741 | 270 | TF_Perturbations_Followed_by_Expression | http://www.ncbi.nlm.nih.gov/geo/ | 1958 | 34d08a4878c19584aaf180377f2ea96faa6a6eb1 | 1 |
| 27360 | 802 | Chromosome_Location_hg19 | http://hgdownload.cse.ucsc.edu/downloads.html | 36 | fdab39c467ba6b0fb0288df1176d7dfddd7196d5 | 6 |
| 13072 | 26 | NIH_Funded_PIs_2017_Human_GeneRIF | https://www.ncbi.nlm.nih.gov/pubmed/ | 5687 | 859b100fac3ca774ad84450b1fbb65a78fcc6b12 | 6 |
| 13464 | 45 | NIH_Funded_PIs_2017_Human_AutoRIF | https://www.ncbi.nlm.nih.gov/pubmed/ | 12558 | fc5bf033b932cf173633e783fc8c6228114211f8 | 6 |
| 13787 | 200 | Rare_Diseases_AutoRIF_ARCHS4_Predictions | https://amp.pharm.mssm.edu/geneshot/ | 3725 | 375ff8cdd64275a916fa24707a67968a910329bb | 4 |
| 13929 | 200 | Rare_Diseases_GeneRIF_ARCHS4_Predictions | https://www.ncbi.nlm.nih.gov/gene/about-generif | 2244 | 0f7fb7f347534779ecc6c87498e96b5460a8d652 | 4 |
| 16964 | 200 | NIH_Funded_PIs_2017_AutoRIF_ARCHS4_Predictions | https://www.ncbi.nlm.nih.gov/pubmed/ | 12558 | f77de51aaf0979dd6f56381cf67ba399b4640d28 | 6 |
| 17258 | 200 | NIH_Funded_PIs_2017_GeneRIF_ARCHS4_Predictions | https://www.ncbi.nlm.nih.gov/pubmed/ | 5684 | 25fa899b715cd6a9137f6656499f89cd25144029 | 6 |
| 10352 | 58 | Rare_Diseases_GeneRIF_Gene_Lists | https://www.ncbi.nlm.nih.gov/gene/about-generif | 2244 | 0fb9ac92dbe52024661c088f71a1134f00567a8b | 4 |
| 10471 | 76 | Rare_Diseases_AutoRIF_Gene_Lists | https://amp.pharm.mssm.edu/geneshot/ | 3725 | ee3adbac2da389959410260b280e7df1fd3730df | 4 |
| 12419 | 491 | SubCell_BarCode | http://www.subcellbarcode.org/ | 104 | b50bb9480d8a77103fb75b331fd9dd927246939a | 2 |
| 19378 | 37 | GWAS_Catalog_2019 | https://www.ebi.ac.uk/gwas | 1737 | fef3864bcb5dd9e60cee27357eff30226116c49b | 4 |
| 6201 | 45 | WikiPathways_2019_Human | https://www.wikipathways.org/ | 472 | b0c9e9ebb9014f14561e896008087725a2db24b7 | 7 |
| 4558 | 54 | WikiPathways_2019_Mouse | https://www.wikipathways.org/ | 176 | e7750958da20f585c8b6d5bc4451a5a4305514ba | 7 |
| 3264 | 22 | TRRUST_Transcription_Factors_2019 | https://www.grnpedia.org/trrust/ | 571 | 5f8cf93e193d2bcefa5a37ccdf0eefac576861b0 | 1 |
| 7802 | 92 | KEGG_2019_Human | https://www.kegg.jp/ | 308 | 3477bc578c4ea5d851dcb934fe2a41e9fd789bb4 | 7 |
| 8551 | 98 | KEGG_2019_Mouse | https://www.kegg.jp/ | 303 | 187eb44b2d6fa154ebf628eba1f18537f64e797c | 7 |
| 12444 | 23 | InterPro_Domains_2019 | https://www.ebi.ac.uk/interpro/ | 1071 | 18dd5ec520fdf589a93d6a7911289c205e1ddf22 | 6 |
| 9000 | 20 | Pfam_Domains_2019 | https://pfam.xfam.org/ | 608 | a6325ed264f9ac9e6518796076c46a1d885cca7a | 6 |
| 7744 | 363 | DepMap_WG_CRISPR_Screens_Broad_CellLines_2019 | https://depmap.org/ | 558 | 0b08b32b20854ac8a738458728a9ea50c2e04800 | 4 |
| 6204 | 387 | DepMap_WG_CRISPR_Screens_Sanger_CellLines_2019 | https://depmap.org/ | 325 | b7c4ead26d0eb64f1697c030d31682b581c8bb56 | 4 |
| 13420 | 32 | MGI_Mammalian_Phenotype_Level_4_2019 | http://www.informatics.jax.org/ | 5261 | f1bed632e89ebc054da44236c4815cdce03ef5ee | 7 |
| 14148 | 122 | UK_Biobank_GWAS_v1 | https://www.ukbiobank.ac.uk/tag/gwas/ | 857 | 958fb52e6215626673a5acf6e9289a1b84d11b4a | 4 |
| 9813 | 49 | BioPlanet_2019 | https://tripod.nih.gov/bioplanet/ | 1510 | e110851dfc763d30946f2abedcc2cd571ac357a0 | 2 |
| 1397 | 13 | ClinVar_2019 | https://www.ncbi.nlm.nih.gov/clinvar/ | 182 | 0a95303f8059bec08836ecfe02ce3da951150547 | 4 |
| 9116 | 22 | PheWeb_2019 | http://pheweb.sph.umich.edu/ | 1161 | 6a7c7321b6b72c5285b722f7902d26a2611117cb | 4 |
| 17464 | 63 | DisGeNET | https://www.disgenet.org | 9828 | 3c261626478ce9e6bf2c7f0a8014c5e901d43dc0 | 4 |
| 394 | 73 | HMS_LINCS_KinomeScan | http://lincs.hms.harvard.edu/kinomescan/ | 148 | 47ba06cdc92469ac79400fc57acd84ba343ba616 | 2 |
| 11851 | 586 | CCLE_Proteomics_2020 | https://portals.broadinstitute.org/ccle | 378 | 7094b097ae2301a1d6a5bd856a193b084cca993d | 5 |
| 8189 | 421 | ProteomicsDB_2020 | https://www.proteomicsdb.org/ | 913 | 8c87c8346167bac2ba68195a32458aba9b1acfd1 | 5 |
| 18704 | 100 | lncHUB_lncRNA_Co-Expression | https://amp.pharm.mssm.edu/lnchub/ | 3729 | 45b597d7efa5693b7e4172b09c0ed2dda3305582 | 1 |
| 5605 | 39 | Virus-Host_PPI_P-HIPSTer_2020 | http://phipster.org/ | 6715 | a592eed13e8e9496aedbab63003b965574e46a65 | 2 |
| 5718 | 31 | Elsevier_Pathway_Collection | http://www.transgene.ru/disease-pathways/ | 1721 | 9196c760e3bcae9c9de1e3f87ad81f96bde24325 | 2 |
| 14156 | 40 | Table_Mining_of_CRISPR_Studies | 802 | ad580f3864fa8ff69eaca11f6d2e7f9b86378d08 | 6 | |
| 16979 | 295 | COVID-19_Related_Gene_Sets | https://amp.pharm.mssm.edu/covid19 | 205 | 72b0346849570f66a77a6856722601e711596cb4 | 7 |
| 4383 | 146 | MSigDB_Hallmark_2020 | https://www.gsea-msigdb.org/gsea/msigdb/collections.jsp | 50 | 6952efda94663d4bd8db09bf6eeb4e67d21ef58c | 2 |
| 54974 | 483 | Enrichr_Users_Contributed_Lists_2020 | https://maayanlab.cloud/Enrichr | 1482 | 8dc362703b38b30ac3b68b6401a9b20a58e7d3ef | 6 |
| 12118 | 448 | TG_GATES_2020 | https://toxico.nibiohn.go.jp/english/ | 1190 | 9e32560437b11b4628b00ccf3d584360f7f7daee | 4 |
| 12361 | 124 | Allen_Brain_Atlas_10x_scRNA_2021 | https://portal.brain-map.org/ | 766 | 46f8235cb585829331799a71aec3f7c082170219 | 5 |
| 9763 | 139 | Descartes_Cell_Types_and_Tissue_2021 | https://descartes.brotmanbaty.org/bbi/human-gene-expression-during-development/ | 172 | 5 | |
| 8078 | 102 | KEGG_2021_Human | https://www.kegg.jp/ | 320 | 2 | |
| 7173 | 43 | WikiPathway_2021_Human | https://www.wikipathways.org/ | 622 | 2 | |
| 5833 | 100 | HuBMAP_ASCT_plus_B_augmented_w_RNAseq_Coexpression | https://hubmapconsortium.github.io/ccf-asct-reporter/ | 344 | 5 | |
| 14937 | 33 | GO_Biological_Process_2021 | http://www.geneontology.org/ | 6036 | 3 | |
| 11497 | 80 | GO_Cellular_Component_2021 | http://www.geneontology.org/ | 511 | 3 | |
| 11936 | 34 | GO_Molecular_Function_2021 | http://www.geneontology.org/ | 1274 | 3 | |
| 9767 | 33 | MGI_Mammalian_Phenotype_Level_4_2021 | http://www.informatics.jax.org/ | 4601 | 3 | |
| 14167 | 80 | CellMarker_Augmented_2021 | http://biocc.hrbmu.edu.cn/CellMarker/ | 1097 | 5 | |
| 17851 | 102 | Orphanet_Augmented_2021 | http://www.orphadata.org/ | 3774 | 4 | |
| 16853 | 360 | COVID-19_Related_Gene_Sets_2021 | https://maayanlab.cloud/covid19/ | 478 | 4 | |
| 6654 | 136 | PanglaoDB_Augmented_2021 | https://panglaodb.se/ | 178 | 5 | |
| 1683 | 10 | Azimuth_Cell_Types_2021 | https://azimuth.hubmapconsortium.org/ | 341 | 5 | |
| 20414 | 112 | PhenGenI_Association_2021 | https://www.ncbi.nlm.nih.gov/gap/phegeni | 950 | 4 | |
| 26076 | 250 | RNAseq_Automatic_GEO_Signatures_Human_Down | https://maayanlab.cloud/archs4/ | 4269 | 8 | |
| 26338 | 250 | RNAseq_Automatic_GEO_Signatures_Human_Up | https://maayanlab.cloud/archs4/ | 4269 | 8 | |
| 25381 | 250 | RNAseq_Automatic_GEO_Signatures_Mouse_Down | https://maayanlab.cloud/archs4/ | 4216 | 8 | |
| 25409 | 250 | RNAseq_Automatic_GEO_Signatures_Mouse_Up | https://maayanlab.cloud/archs4/ | 4216 | 8 | |
| 11980 | 250 | GTEx_Aging_Signatures_2021 | https://gtexportal.org/ | 270 | 4 | |
| 31158 | 805 | HDSigDB_Human_2021 | https://www.hdinhd.org/ | 2564 | 4 | |
| 30006 | 815 | HDSigDB_Mouse_2021 | https://www.hdinhd.org/ | 2579 | 4 |
markers_enriched=list()if (markers_found) {
# Upregulated markers
# Convert Seurat names of upregulated marker per cluster to Entrez; use named lists for translation
# Is that still neccessary?
marker_genesets_up = sapply(levels(sc$seurat_clusters), function(x) {
tmp = markers_filt$up %>% dplyr::filter(cluster==x) %>% dplyr::pull(gene)
tmp = sapply(tmp, function(n) seurat_rowname_to_entrez[[n]][1], USE.NAMES=TRUE, simplify=TRUE) %>% unlist() %>% as.character()
return(tmp[!is.na(tmp)])
}, USE.NAMES=TRUE, simplify=TRUE)
# Tests done by Enrichr
marker_genesets_up_enriched = purrr::map(marker_genesets_up, EnrichrTest, databases=param$enrichr_dbs, padj=param$enrichr_padj)
marker_genesets_up_enriched = purrr::map(list_names(marker_genesets_up_enriched), function(n) {
return(purrr::map(marker_genesets_up_enriched[[n]], function(d){
return(cbind(d, Cluster=rep(n, nrow(d)), Direction=rep("up", nrow(d))))
}))
})
# Write to files
invisible(purrr::map(names(marker_genesets_up_enriched), function(n) {
EnrichrWriteResults(enrichr_results=marker_genesets_up_enriched[[n]],
file=file.path(param$path_out, "marker_degs", paste0("functions_marker_up_cluster_", n, "_vs_rest.xlsx")))
}))
# Downregulated markers
# Convert Seurat names of downregulated marker per cluster to Entrez; use named lists for translation
# Is that still neccessary?
marker_genesets_down = sapply(levels(sc$seurat_clusters), function(x) {
tmp = markers_filt$down %>% dplyr::filter(cluster==x) %>% dplyr::pull(gene)
tmp = sapply(tmp, function(x) seurat_rowname_to_entrez[[x]][1], USE.NAMES=TRUE, simplify=TRUE) %>% unlist() %>% as.character()
return(tmp[!is.na(tmp)])
}, USE.NAMES=TRUE, simplify=TRUE)
# Tests done by Enrichr
marker_genesets_down_enriched = purrr::map(marker_genesets_down, EnrichrTest, databases=param$enrichr_dbs, padj=param$enrichr_padj)
marker_genesets_down_enriched = purrr::map(list_names(marker_genesets_down_enriched), function(n) {
return(purrr::map(marker_genesets_down_enriched[[n]], function(d){
return(cbind(d, Cluster=rep(n, nrow(d)), Direction=rep("down", nrow(d))))
}))
})
# Write to files
invisible(purrr::map(names(marker_genesets_down_enriched), function(n) {
EnrichrWriteResults(enrichr_results=marker_genesets_down_enriched[[n]],
file=file.path(param$path_out, "marker_degs", paste0("functions_marker_down_cluster_", n, "_vs_rest.xlsx")))
}))
# Combine, flatten into data.frame and add to sc misc slot
marker_genesets_enriched = c(marker_genesets_up_enriched, marker_genesets_down_enriched)
marker_genesets_enriched = unname(marker_genesets_enriched)
marker_genesets_enriched = purrr::map(marker_genesets_enriched, FlattenEnrichr) %>% dplyr::bind_rows()
marker_genesets_enriched$Cluster = factor(marker_genesets_enriched$Cluster, levels=levels(sc$seurat_clusters))
marker_genesets_enriched$Direction = factor(marker_genesets_enriched$Direction, levels=c("up", "down"))
misc_content = Misc(sc, "markers")
misc_content[["enrichr"]] = marker_genesets_enriched
suppressWarnings({Misc(sc, "markers") = misc_content})
}The following table contains the top enriched terms per cluster.
# Top enriched terms (TODO: better plots, functions)
if (markers_found) {
cat("#### {.tabset} \n \n")
# Get top ten up and down over all databases per cluster
marker_genesets_top_enriched = marker_genesets_enriched %>% dplyr::group_by(Cluster, Direction) %>%
dplyr::top_n(n=10, wt=Combined.Score)
# Print as tabs
for(n in levels(marker_genesets_top_enriched$Cluster)){
cat("##### ", n, " \n")
print(knitr::kable(marker_genesets_top_enriched %>% dplyr::ungroup() %>% dplyr::filter(Cluster==n) %>% dplyr::select(Database, Term, Direction, Adjusted.P.value, Odds.Ratio, Combined.Score),
align="l", caption="Top ten enriched terms per geneset", format="html") %>%
kableExtra::kable_styling(bootstrap_options=c("striped", "hover")) %>%
kableExtra::scroll_box(width="100%", height="700px"))
cat(" \n \n")
}
cat(" \n \n")
}| Database | Term | Direction | Adjusted.P.value | Odds.Ratio | Combined.Score |
|---|---|---|---|---|---|
| Azimuth_Cell_Types_2021 | CD4+ Naive T CL0000895 | up | 0.0e+00 | 907.5000 | 39829.636 |
| Azimuth_Cell_Types_2021 | CD4+ Central Memory T CL0000897 | up | 0.0e+00 | 665.6667 | 14526.484 |
| Azimuth_Cell_Types_2021 | CD4+ Effector Memory T 3 CL0001044 | up | 0.0e+00 | 665.6667 | 14526.484 |
| Azimuth_Cell_Types_2021 | CD4 T CL0000624 | up | 7.0e-07 | 407.5306 | 6326.449 |
| Azimuth_Cell_Types_2021 | CD4+ Central Memory T 3 CL0000897 | up | 7.0e-07 | 407.5306 | 6326.449 |
| Azimuth_Cell_Types_2021 | CD4+ Effector Memory T CL0001044 | up | 7.0e-07 | 407.5306 | 6326.449 |
| Azimuth_Cell_Types_2021 | CD4+ T Cell CL0000624 | up | 7.0e-07 | 407.5306 | 6326.449 |
| Azimuth_Cell_Types_2021 | Naive Regulatory T CL0000815 | up | 7.0e-07 | 407.5306 | 6326.449 |
| CellMarker_Augmented_2021 | Naive CD8+ T cell:Peripheral Blood | up | 0.0e+00 | 173.2927 | 6911.883 |
| CellMarker_Augmented_2021 | Naive T cell:Undefined | up | 1.1e-06 | 1426.7143 | 25687.802 |
| CellMarker_Augmented_2021 | Central Memory T cell:Undefined | up | 2.4e-06 | 713.2857 | 11950.146 |
| Azimuth_Cell_Types_2021 | CD56-dim Natural Killer CL0000939 | down | 0.0e+00 | 2305.0385 | 85950.828 |
| Azimuth_Cell_Types_2021 | CD56-dim Natural Killer 2 CL0000939 | down | 0.0e+00 | 2305.0385 | 85950.828 |
| Azimuth_Cell_Types_2021 | Natural Killer Cell CL0000623 | down | 0.0e+00 | 2305.0385 | 85950.828 |
| Azimuth_Cell_Types_2021 | CD56-dim Natural Killer 1 CL0000939 | down | 0.0e+00 | 1426.8571 | 42580.540 |
| Azimuth_Cell_Types_2021 | CD4+ Cytotoxic T CL0000934 | down | 0.0e+00 | 891.6518 | 25156.399 |
| Azimuth_Cell_Types_2021 | CD8+ Effector Memory T CL0001050 | down | 0.0e+00 | 887.7778 | 20267.772 |
| Azimuth_Cell_Types_2021 | Natural Killer CL0000623 | down | 0.0e+00 | 665.7667 | 14629.232 |
| Azimuth_Cell_Types_2021 | CD8+ Effector Memory T 1 CL0001050 | down | 2.0e-07 | 749.1000 | 12749.739 |
| CellMarker_Augmented_2021 | CD4+ Cytotoxic T cell:Liver | down | 0.0e+00 | 567.4671 | 34686.018 |
| CellMarker_Augmented_2021 | Cytotoxic T cell:Blood | down | 0.0e+00 | 245.5679 | 10326.131 |
| CellMarker_Augmented_2021 | Cytotoxic T cell:Peripheral Blood | down | 0.0e+00 | 245.5679 | 10326.131 |
| CellMarker_Augmented_2021 | T cell:Ascites | down | 0.0e+00 | 245.5679 | 10326.131 |
| CellMarker_Augmented_2021 | pro-Natural Killer Cell (pro-NK cell):Peripheral Blood | down | 0.0e+00 | 245.5679 | 10326.131 |
| Database | Term | Direction | Adjusted.P.value | Odds.Ratio | Combined.Score |
|---|---|---|---|---|---|
| Azimuth_Cell_Types_2021 | Natural Killer Cell CL0000623 | up | 0.0000000 | 2957.4815 | 141499.799 |
| Azimuth_Cell_Types_2021 | Natural Killer CL0000623 | up | 0.0000000 | 1478.5926 | 67200.696 |
| Azimuth_Cell_Types_2021 | CD56-dim Natural Killer CL0000939 | up | 0.0000000 | 1663.5000 | 67028.889 |
| Azimuth_Cell_Types_2021 | CD4+ Cytotoxic T CL0000934 | up | 0.0000000 | 831.6250 | 31300.047 |
| Azimuth_Cell_Types_2021 | CD56-dim Natural Killer 1 CL0000939 | up | 0.0000000 | 1032.4655 | 34277.270 |
| Azimuth_Cell_Types_2021 | CD56-dim Natural Killer 2 CL0000939 | up | 0.0000000 | 1032.4655 | 34277.270 |
| Azimuth_Cell_Types_2021 | CD8+ Effector Memory T CL0001050 | up | 0.0000000 | 1032.4655 | 34277.270 |
| Azimuth_Cell_Types_2021 | CD8+ Effector Memory T 4 CL0001050 | up | 0.0000000 | 1032.4655 | 34277.270 |
| CellMarker_Augmented_2021 | CD4+ Cytotoxic T cell:Liver | up | 0.0000000 | 777.3828 | 93361.448 |
| CellMarker_Augmented_2021 | Effector CD8+ Memory T (Tem) cell:Peripheral Blood | up | 0.0000000 | 289.1453 | 21120.060 |
| Azimuth_Cell_Types_2021 | CD4+ Central Memory T CL0000897 | down | 0.0000111 | 1249.1250 | 15739.235 |
| Azimuth_Cell_Types_2021 | CD4+ Central Memory T 1 CL0000897 | down | 0.0000111 | 1249.1250 | 15739.235 |
| Azimuth_Cell_Types_2021 | CD4+ Central Memory T 3 CL0000897 | down | 0.0000111 | 1249.1250 | 15739.235 |
| Azimuth_Cell_Types_2021 | CD4+ Effector Memory T CL0001044 | down | 0.0000111 | 1249.1250 | 15739.235 |
| Azimuth_Cell_Types_2021 | CD4+ Effector Memory T 3 CL0001044 | down | 0.0000111 | 1249.1250 | 15739.235 |
| Azimuth_Cell_Types_2021 | CD4+ T Cell CL0000624 | down | 0.0000111 | 1249.1250 | 15739.235 |
| Azimuth_Cell_Types_2021 | CD8+ Central Memory T 2 CL0000909 | down | 0.0000111 | 1249.1250 | 15739.235 |
| Azimuth_Cell_Types_2021 | CD4+ Naive T CL0000895 | down | 0.0000363 | 587.5588 | 6619.668 |
| Azimuth_Cell_Types_2021 | CD16+ Monocyte CL0002396 | down | 0.0000363 | 554.8889 | 6193.206 |
| CellMarker_Augmented_2021 | Naive T cell:Undefined | down | 0.0237817 | 999.5000 | 6499.559 |
| Database | Term | Direction | Adjusted.P.value | Odds.Ratio | Combined.Score |
|---|---|---|---|---|---|
| GO_Biological_Process_2018 | platelet degranulation (GO:0002576) | up | 0.0000480 | 49.08304 | 756.0321 |
| GO_Biological_Process_2018 | negative regulation of release of cytochrome c from mitochondria (GO:0090201) | up | 0.0070635 | 153.57692 | 1386.2017 |
| GO_Biological_Process_2018 | lipoxygenase pathway (GO:0019372) | up | 0.0070635 | 153.57692 | 1386.2017 |
| GO_Cellular_Component_2018 | platelet alpha granule lumen (GO:0031093) | up | 0.0000322 | 70.24691 | 985.8344 |
| GO_Cellular_Component_2018 | platelet alpha granule (GO:0031091) | up | 0.0000527 | 51.40052 | 660.3086 |
| Azimuth_Cell_Types_2021 | Platelet CL0000233 | up | 0.0000000 | 554.72222 | 11837.4045 |
| CellMarker_Augmented_2021 | Cancer Stem cell:Mammary Gland | up | 0.0000001 | 80.18145 | 1671.6144 |
| CellMarker_Augmented_2021 | Sertoli cell:Testis | up | 0.0292381 | 237.78571 | 1237.7033 |
| Descartes_Cell_Types_and_Tissue_2021 | Megakaryocytes in Lung | up | 0.0000008 | 40.12095 | 677.7705 |
| Descartes_Cell_Types_and_Tissue_2021 | Megakaryocytes in Adrenal | up | 0.0000009 | 56.75328 | 913.4317 |
| GO_Molecular_Function_2018 | C3HC4-type RING finger domain binding (GO:0055131) | down | 0.0325492 | 333.03333 | 1847.7667 |
| GO_Molecular_Function_2018 | primary miRNA binding (GO:0070878) | down | 0.0325492 | 277.51389 | 1497.0326 |
| GO_Biological_Process_2018 | alternative mRNA splicing, via spliceosome (GO:0000380) | down | 0.0056933 | 363.21818 | 3839.6286 |
| GO_Biological_Process_2018 | regulation of protein transport (GO:0051223) | down | 0.0150563 | 145.17818 | 1292.8828 |
| GO_Biological_Process_2018 | positive regulation of podosome assembly (GO:0071803) | down | 0.0479267 | 333.03333 | 1847.7667 |
| GO_Biological_Process_2018 | pri-miRNA transcription from RNA polymerase II promoter (GO:0061614) | down | 0.0479267 | 277.51389 | 1497.0326 |
| GO_Biological_Process_2018 | granulocyte migration (GO:0097530) | down | 0.0479267 | 277.51389 | 1497.0326 |
| GO_Biological_Process_2018 | formation of translation preinitiation complex (GO:0001731) | down | 0.0479267 | 277.51389 | 1497.0326 |
| GO_Biological_Process_2018 | actin filament network formation (GO:0051639) | down | 0.0479267 | 237.85714 | 1251.4164 |
| GO_Biological_Process_2018 | positive regulation of protein localization to Cajal body (GO:1904871) | down | 0.0479267 | 237.85714 | 1251.4164 |
| GO_Biological_Process_2018 | positive regulation of production of miRNAs involved in gene silencing by miRNA (GO:1903800) | down | 0.0479267 | 237.85714 | 1251.4164 |
| GO_Biological_Process_2018 | regulation of protein localization to Cajal body (GO:1904869) | down | 0.0479267 | 237.85714 | 1251.4164 |
| GO_Cellular_Component_2018 | clathrin-sculpted gamma-aminobutyric acid transport vesicle membrane (GO:0061202) | down | 0.0275018 | 237.85714 | 1251.4164 |
| GO_Cellular_Component_2018 | clathrin-sculpted gamma-aminobutyric acid transport vesicle (GO:0061200) | down | 0.0275018 | 237.85714 | 1251.4164 |
| CellMarker_Augmented_2021 | Natural Killer T (NKT) cell:Fetal Kidney | down | 0.0000005 | 200941.00000 | 3874412.2288 |
| Database | Term | Direction | Adjusted.P.value | Odds.Ratio | Combined.Score |
|---|---|---|---|---|---|
| Azimuth_Cell_Types_2021 | CD14+ Monocyte CL0002057 | up | 0 | 492.60000 | 26910.14 |
| Azimuth_Cell_Types_2021 | Monocyte CL0000576 | up | 0 | 197060.00000 | 8345967.83 |
| CellMarker_Augmented_2021 | Monocyte:Fetal Kidney | up | 0 | 82.94584 | 44800.62 |
| CellMarker_Augmented_2021 | Monocyte:Lung | up | 0 | 115.06376 | 21367.20 |
| CellMarker_Augmented_2021 | Monocyte:Undefined | up | 0 | 109.82242 | 20210.32 |
| CellMarker_Augmented_2021 | Monocyte:Liver | up | 0 | 109.97851 | 19903.89 |
| CellMarker_Augmented_2021 | Macrophage:Synovium | up | 0 | 109.97851 | 19903.89 |
| CellMarker_Augmented_2021 | Monocyte:Plasma | up | 0 | 102.83473 | 17735.13 |
| CellMarker_Augmented_2021 | Monocyte:Synovium | up | 0 | 102.83473 | 17735.13 |
| CellMarker_Augmented_2021 | Classical monocyte:Blood | up | 0 | 96.16304 | 16067.89 |
| CellMarker_Augmented_2021 | Intermediate monocyte:Blood | up | 0 | 96.16304 | 16067.89 |
| CellMarker_Augmented_2021 | Non-classical monocyte:Blood | up | 0 | 96.16304 | 16067.89 |
| CellMarker_Augmented_2021 | CD14++CD16- monocyte:Blood | up | 0 | 96.16304 | 16067.89 |
| GO_Biological_Process_2018 | SRP-dependent cotranslational protein targeting to membrane (GO:0006614) | down | 0 | 262.04216 | 21935.22 |
| GO_Biological_Process_2018 | cotranslational protein targeting to membrane (GO:0006613) | down | 0 | 247.63387 | 20485.79 |
| GO_Biological_Process_2018 | protein targeting to ER (GO:0045047) | down | 0 | 234.72255 | 19197.98 |
| GO_Biological_Process_2018 | viral gene expression (GO:0019080) | down | 0 | 200.68687 | 15858.80 |
| GO_Biological_Process_2018 | nuclear-transcribed mRNA catabolic process, nonsense-mediated decay (GO:0000184) | down | 0 | 196.30435 | 15435.28 |
| GO_Biological_Process_2018 | viral transcription (GO:0019083) | down | 0 | 194.18377 | 15230.92 |
| GO_Cellular_Component_2018 | cytosolic ribosome (GO:0022626) | down | 0 | 173.54895 | 13262.83 |
| GO_Cellular_Component_2018 | cytosolic small ribosomal subunit (GO:0022627) | down | 0 | 248.07088 | 13887.70 |
| Azimuth_Cell_Types_2021 | CD8+ Memory/Effector T CL0000625 | down | 0 | 539.27027 | 13780.66 |
| Azimuth_Cell_Types_2021 | CD8 T CL0000625 | down | 0 | 539.27027 | 13780.66 |
If requested, we identify genes that are differentially expressed between two groups of cells. Groups can be defined by columns in the cell metadata. Different types of tests can be used and input data for testing can be the different assays as well as the computed dimensionality reductions. Resulting p-values are adjusted using the Bonferroni method. The names of differentially expressed genes per cluster, alongside statistical measures and additional gene annotation are written to file.
# We first compute the DEGs for all requested contrasts
# Prepare a list with contrasts (input can be R data.frame table or Excel file)
degs_contrasts_list = DegsSetupContrastsList(sc, param$deg_contrasts, param$latent_vars)
# Add the actual data to the list
degs_contrasts_list = purrr::map(degs_contrasts_list, function(contrast){
# If there were already errors, just return
if (length(contrast[["error_messages"]]) > 0) return(c(contrast, list(object=NULL, cells_group1_idx_subset=as.integer(), cells_group2_idx_subset=as.integer())))
# Get cells indices
cells_group1_idx = contrast[["cells_group1_idx"]]
cells_group2_idx = contrast[["cells_group2_idx"]]
# Create object
if (contrast[["use_reduction"]]) {
# Use dimensionality reduction
contrast[["object"]] = Seurat::Reductions(sc, slot=contrast[["assay"]])
} else {
# Use assay
contrast[["object"]] = Seurat::GetAssay(sc[,unique(c(cells_group1_idx, cells_group2_idx))], assay=contrast[["assay"]])
# This saves a lot of memory for parallelisation
if (contrast[["slot"]]!="scale.data") contrast[["object"]]@scale.data = new(Class="matrix")
}
# Variable latent vars must be passed as data.frame
if (!is.null(contrast[["latent_vars"]]) && length(contrast[["latent_vars"]]) > 0) {
contrast[["latent_vars"]] = sc[[unique(c(cells_group1_idx, cells_group2_idx)), contrast[["latent_vars"]], drop=FALSE]]
}
# Now update cell indices so that they match to subset
contrast[["cells_group1_idx_subset"]] = match(colnames(sc)[cells_group1_idx], colnames(contrast[["object"]]))
contrast[["cells_group2_idx_subset"]] = match(colnames(sc)[cells_group2_idx], colnames(contrast[["object"]]))
return(contrast)
})
# Compute the tests
# TODO: this chunk may be done in parallel in future
degs_contrasts_results = purrr::map(degs_contrasts_list, function(contrast) {
if (length(contrast$error_messages)==0) {
# No errors, do contrast
test_results = suppressWarnings(
DegsTestCellSets(object=contrast[["object"]],
slot=contrast[["slot"]],
cells_1=colnames(contrast[["object"]])[contrast[["cells_group1_idx_subset"]]],
cells_2=colnames(contrast[["object"]])[contrast[["cells_group2_idx_subset"]]],
is_reduction=contrast[["use_reduction"]],
logfc.threshold=contrast[["log2FC"]],
test.use=contrast[["test"]],
min.pct=contrast[["min_pct"]],
latent.vars=contrast[["latent_vars"]])
)
} else {
# Errors, return empty data.frame
test_results = DegsEmptyResultsTable()
}
# Sort and filter table
test_results = test_results %>% DegsSort() %>% DegsFilter(contrast[["log2FC"]], contrast[["padj"]], split_by_dir=FALSE)
# Add mean gene expression data (counts or data, dep on slot)
avg.1 = DegsAvgData(contrast[["object"]], cells=contrast[["cells_group1_idx_subset"]], genes=test_results$gene, slot=contrast[["slot"]])[,1]
avg.2 = DegsAvgData(contrast[["object"]], cells=contrast[["cells_group2_idx_subset"]], genes=test_results$gene, slot=contrast[["slot"]])[,1]
test_results = cbind(test_results, avg.1, avg.2)
# Add test results and drop unneccessary data
contrast = c(contrast, list(results=test_results))
contrast[["object"]] = NULL
contrast[["cells_group1_idx_subset"]] = NULL
contrast[["cells_group2_idx_subset"]] = NULL
return(contrast)
})
# Also remove objects from deg_contrasts_list (save memory)
degs_contrasts_list = purrr::map(degs_contrasts_list, function(contrast){ contrast[["object"]] = NULL; return(contrast)})# Use the existing variable and add Enrichr results
# Not in parallel due to server load
degs_contrasts_results = purrr::map(degs_contrasts_results, function(contrast) {
# Get results table
results_table = contrast$results
# Drop existing results
if ("enrichr" %in% names(contrast)) contrast[["enrichr"]] = NULL
# Split into up- and downregulated DEGs, then translate to Entrez gene, runEnrichr
degs_up = dplyr::filter(results_table, avg_log2FC > 0) %>% dplyr::pull(gene) %>% unique()
degs_up = sapply(degs_up, function(n) seurat_rowname_to_entrez[[n]][1], USE.NAMES=TRUE, simplify=TRUE) %>% unlist() %>% as.character()
degs_up = degs_up[!is.na(degs_up)]
enrichr_results_up = EnrichrTest(genes=degs_up, databases=param$enrichr_dbs, padj=param$enrichr_padj)
degs_down = dplyr::filter(results_table, avg_log2FC < 0) %>% dplyr::pull(gene) %>% unique()
degs_down = sapply(degs_down, function(n) seurat_rowname_to_entrez[[n]][1], USE.NAMES=TRUE, simplify=TRUE) %>% unlist() %>% as.character()
degs_down = degs_down[!is.na(degs_down)]
enrichr_results_down = EnrichrTest(genes=degs_down, databases=param$enrichr_dbs, padj=param$enrichr_padj)
# Flatten both enrichr results into tables
enrichr_results_up = purrr::map_dfr(names(enrichr_results_up), function(n) {
return(cbind(enrichr_results_up[[n]],
list(Database=factor(rep(n, nrow(enrichr_results_up[[n]])), levels=names(enrichr_results_up)),
Direction=factor(rep("up", nrow(enrichr_results_up[[n]])), levels=c("up", "down"))
)
))
})
enrichr_results_down = purrr::map_dfr(names(enrichr_results_down), function(n) {
return(cbind(enrichr_results_down[[n]],
list(Database=factor(rep(n, nrow(enrichr_results_down[[n]])), levels=names(enrichr_results_down)),
Direction=factor(rep("up", nrow(enrichr_results_down[[n]])), levels=c("up", "down"))
)
))
})
# Rbind and add factor levels
enrichr_results = rbind(enrichr_results_up, enrichr_results_down)
return(c(contrast, list(enrichr=enrichr_results)))
})# Now regroup list so that subsets are together again
original_contrast_rows = purrr::map_int(degs_contrasts_results, function(contrast){ return(contrast[["contrast_row"]]) })
degs = split(degs_contrasts_results, original_contrast_rows)
# Write degs to files
invisible(purrr::map_chr(degs, function(degs_subsets) {
# The output file
file = file.path(param$path_out, "marker_degs", paste0("degs_contrast_row_", degs_subsets[[1]][["contrast_row"]], "_results.xlsx"))
# Write degs
degs_subsets_results = purrr::map(degs_subsets, function(contrast) {return(contrast[["results"]])})
names(degs_subsets_results) = purrr::map_chr(degs_subsets, function(contrast) {return(ifelse(!is.na(contrast[["subset_group"]]), contrast[["subset_group"]], "All"))})
file = DegsWriteToFile(degs_subsets_results,
annot_ensembl=annot_ensembl,
gene_to_ensembl=seurat_rowname_to_ensembl,
file=file,
additional_readme=NULL)
return(file)
}))
invisible(purrr::map_chr(degs, function(degs_subsets) {
# The output file
file = file.path(param$path_out, "marker_degs", paste0("degs_contrast_row_", degs_subsets[[1]][["contrast_row"]], "_functions.xlsx"))
# Write Enrichr results
degs_subsets_enrichr = purrr::map(degs_subsets, function(contrast) {return(contrast[["enrichr"]])})
names(degs_subsets_enrichr) = purrr::map_chr(degs_subsets, function(contrast) {return(ifelse(!is.na(contrast[["subset_group"]]), contrast[["subset_group"]], "All"))})
file = EnrichrWriteResults(degs_subsets_enrichr, file=file)
return(file)
}))
# Add to sc object
Misc(sc, "degs") = degsfor (i in seq(degs)) {
for (j in seq(degs[[i]])) {
# If there are errors, skip
if (length(degs[[i]][[j]]$error_messages) > 0) {
next
}
# Average expression of all genes
x = subset(sc, cells=degs[[i]][[j]]$cells_group2_idx) %>% GetAssayData(slot="data")
x = log2(Matrix::rowMeans(expm1(x)) + 1)
y = subset(sc, cells=degs[[i]][[j]]$cells_group1_idx) %>% GetAssayData(slot="data")
y = log2(Matrix::rowMeans(expm1(y)) + 1)
sc_avg_log2FC = data.frame(x, y, col="none", gene=rownames(sc))
lims = c(min(c(x, y)), max(x, y))
## Color DEGs
up = degs[[i]][[j]]$results %>% dplyr::filter(avg_log2FC > 0) %>% dplyr::pull(gene)
if (length(up) > 0) sc_avg_log2FC[up, "col"] = "up"
down = degs[[i]][[j]]$results %>% dplyr::filter(avg_log2FC < 0) %>% dplyr::pull(gene)
if (length(down) > 0) sc_avg_log2FC[down, "col"] = "down"
## Plots
degs[[i]][[j]]$plot_scatter = ggplot(sc_avg_log2FC %>% dplyr::arrange(col, gene), aes(x=x, y=y, col=col)) +
geom_abline(slope=1, intercept=0, col="lightgrey") +
geom_abline(slope=1, intercept=c(-degs[[i]][[j]]$log2FC, degs[[i]][[j]]$log2FC), col="lightgrey", lty=2) +
geom_point() +
xlim(lims) + ylim(lims) +
AddStyle(ylab=degs[[i]][[j]]$condition_group1, xlab=degs[[i]][[j]]$condition_group2,
col=c(none="grey", up="darkgoldenrod1", down="steelblue"),
legend_position="bottom", legend_title="Filtered genes")
# Feature plot of top 4 genes, sorted by the p-value
degs_top = degs[[i]][[j]]$results %>% dplyr::top_n(n=-4, wt=p_val) %>% dplyr::top_n(n=-4, wt=avg_log2FC) %>% dplyr::pull(gene)
if (length(degs_top) > 0) {
p_list = Seurat::FeaturePlot(sc, features=degs_top, cols=c("lightgrey", param$col), combine=FALSE, label=TRUE)
for (p in seq(p_list)) p_list[[p]] = p_list[[p]] + AddStyle(legend_position="bottom", xlab="", ylab="")
degs[[i]][[j]]$plot_feature = patchwork::wrap_plots(p_list, ncol=ifelse(length(degs_top) > 1, 2, 1))
} else {
degs[[i]][[j]]$plot_feature = NULL
}
}
}knitr_header_string = "
## {{condition_column}}: {{condition_group1}} vs {{condition_group2}}
General configuration:
* assay: {{assay}}
* slot: {{slot}}
* test: {{test}}
* maximum adjusted p-value: {{padj}}
* minimum absolute log2 foldchange: {{log2FC}}
* minimum percentage of cells: {{min_pct}}
* latent vars: {{latent_vars}}
Subset on column: \'{{subset_column}}\'"
if (length(degs)==0) message("No DEG contrasts specified.")
for (i in seq(degs)) {
# Get subsets
degs_subsets = degs[[i]]
first_contrast = degs_subsets[[1]]
# Create header
cat(
knitr::knit_expand(text=knitr_header_string,
condition_column=first_contrast[["condition_column"]],
condition_group1=first_contrast[["condition_group1"]],
condition_group2=first_contrast[["condition_group2"]],
assay=first_contrast[["assay"]],
slot=first_contrast[["slot"]],
test=first_contrast[["test"]],
padj=first_contrast[["padj"]],
log2FC=first_contrast[["log2FC"]],
min_pct=first_contrast[["min_pct"]],
latent_vars=ifelse(!is.null(first_contrast[["latent_vars"]]), paste(colnames(first_contrast[["latent_vars"]]), collapse=","), "-"),
subset_column=ifelse(is.na(first_contrast[["subset_column"]]), "-", first_contrast[["subset_column"]]))
, "\n")
# Get error messages
error_messages = unique(purrr::flatten_chr(purrr::map(degs_subsets, function(contrast){return(contrast[["error_messages"]])})))
# Create combined results table
degs_subsets_results = purrr::map_dfr(degs_subsets, function(contrast) {
subset_group_value = ifelse(!is.na(contrast[["subset_group"]]), contrast[["subset_group"]], "All")
return(contrast[["results"]] %>%
dplyr::summarise(subset_group=subset_group_value,
Cells1=length(contrast[["cells_group1_idx"]]),
Cells2=length(contrast[["cells_group2_idx"]]),
DEGs=length(gene),
DEGs_up=sum(avg_log2FC > 0),
DEGs_down=sum(avg_log2FC < 0)))
}) %>% tibble::as_tibble()
# Print warnings/errors
if (length(error_messages) > 0) {
warning(error_messages)
}
# Print summary table
print(
knitr::kable(degs_subsets_results,
align="l", caption="DEG summary",
col.names=c("Subset", "Cells in group 1", "Cells in group 2", "# DEGs", "# DEGs upregulated", "# DEGs downregulated"),
format="html") %>%
kableExtra::kable_styling(bootstrap_options=c("striped", "hover"))
)
# Print plots per contrast
for (contrast in seq(degs_subsets)) {
if (!"plot_scatter" %in% names(degs_subsets[[contrast]]) | !"plot_feature" %in% names(degs_subsets[[contrast]])) next
p = degs_subsets[[contrast]]$plot_scatter | degs_subsets[[contrast]]$plot_feature
title = "Scatterplot and feature plots"
if (!is.na(degs_subsets[[contrast]]$subset_column)) {
title = paste0(title, " (subset ", degs_subsets[[contrast]]$subset_column,
": ", degs_subsets[[contrast]]$subset_group, ")")
}
p = p + patchwork::plot_annotation(title=title)
print(p)
}
cat("\n \n")
} General configuration:
| Subset | Cells in group 1 | Cells in group 2 | # DEGs | # DEGs upregulated | # DEGs downregulated |
|---|---|---|---|---|---|
| All | 100 | 100 | 630 | 232 | 398 |
General configuration:
| Subset | Cells in group 1 | Cells in group 2 | # DEGs | # DEGs upregulated | # DEGs downregulated |
|---|---|---|---|---|---|
| 1 | 29 | 37 | 215 | 118 | 97 |
| 2 | 29 | 32 | 193 | 102 | 91 |
| 3 | 31 | 15 | 41 | 5 | 36 |
| 4 | 11 | 16 | 2 | 2 | 0 |
General configuration:
| Subset | Cells in group 1 | Cells in group 2 | # DEGs | # DEGs upregulated | # DEGs downregulated |
|---|---|---|---|---|---|
| 1 | 20 | 24 | 0 | 0 | 0 |
| 2 | 20 | 17 | 0 | 0 | 0 |
# Add colour lists for orig.dataset
col = GenerateColours(num_colours=length(levels(sc$orig.dataset)), names=levels(sc$orig.dataset), palette=param$col_palette_samples, alphas=1)
sc = ScAddLists(sc, lists=list(orig.dataset=col), lists_slot="colour_lists")
# Add experiment details
Seurat::Misc(sc, "experiment") = list(project_id=param$project_id, date=Sys.Date(), species=gsub("_gene_ensembl", "", param$mart_dataset))
# Add parameter
Seurat::Misc(sc, "parameters") = param
# Add technical output (note: cannot use Misc function here)
sc@misc$technical = data.frame(ScrnaseqSessionInfo(param$path_to_git))× (Message)
Registered S3 method overwritten by ‘cli’:
method from
print.boxx spatstat.geom
If all provided datasets are of type “10x,” we export the UMAP 2D visualisation, metadata such as the cell clusters, and lists of differentially expressed genes, so you can open and work with these in the Loupe Cell Browser https://support.10xgenomics.com/single-cell-gene-expression/software/visualization/latest/what-is-loupe-cell-browser.
if (all(param$path_data$type == "10x")) {
# Export reductions (umap, pca, others)
for(r in Seurat::Reductions(sc)) {
write.table(Seurat::Embeddings(sc, reduction=r)[,1:2] %>% as.data.frame() %>% tibble::rownames_to_column(var="Barcode"),
file=file.path(param$path_out, "export", paste0("Loupe_projection_", r, ".csv")), col.names=TRUE, row.names=FALSE, quote=FALSE, sep=",")
}
# Export categorical metadata
loupe_meta = sc@meta.data
idx_keep = sapply(1:ncol(loupe_meta), function(x) !is.numeric(loupe_meta[,x]))
write.table(x=loupe_meta[, idx_keep] %>% tibble::rownames_to_column(var="barcode"),
file=file.path(param$path_out, "export", "Loupe_metadata.csv"), col.names=TRUE, row.names=FALSE, quote=FALSE, sep=",")
# Export gene sets (CC genes, known markers, per-cluster markers up- and downregulated, ...)
gene_lists = Misc(sc, "gene_lists")
# Remove empty gene sets
gene_lists = gene_lists[purrr::map_int(gene_lists, length) > 0]
loupe_genesets = purrr::map_dfr(names(gene_lists), function(n) {return(data.frame(List=n, Name=gene_lists[[n]]))})
loupe_genesets$Ensembl = seurat_rowname_to_ensembl[loupe_genesets$Name]
write.table(loupe_genesets, file=file.path(param$path_out, "export", "Loupe_genesets.csv"), col.names=TRUE, row.names=FALSE, quote=FALSE, sep=",")
}Result files are:
Projections can be imported in Loupe via “Import Projection,” cell meta data via “Import Categories” and gene sets via “Import Lists”
We export the assay data, cell metadata, clustering and visualisation in a format that can be read by the cellxgene browser https://github.com/chanzuckerberg/cellxgene.
# Convert Seurat single cell object to python anndata object which will be accessible via reticulate here
adata = sceasy::convertFormat(sc, from="seurat", to="anndata", outFile=NULL, assay=DefaultAssay(sc))
# Set up correct colours (see https://chanzuckerberg.github.io/cellxgene/posts/prepare) so that colours match
adata$uns = dict(
seurat_clusters_colors=np_array(unname(Misc(sc, "colour_lists")$seurat_clusters), dtype="<U7"),
orig.ident_colors=np_array(unname(Misc(sc, "colour_lists")$orig.ident), dtype="<U7"),
orig.dataset_colors=np_array(unname(Misc(sc, "colour_lists")$orig.dataset), dtype="<U7"),
Phase_colors=np_array(unname(Misc(sc, "colour_lists")$Phase), dtype="<U7")
)
# Write to h5ad file
adata$write(file.path(param$path_out, "export", "sc.h5ad"), compression="gzip")Result files are:
Copy/upload to data directory of your cellxgene browser
We export the assay data, clustering, visualisation, marker genes, enriched pathways and degs in a format that can be read by the Cerebro Browser https://github.com/romanhaa/cerebroApp/.
# Export to cerebro
res = ExportToCerebro(sc=sc,
path=file.path(param$path_out, "export", "sc.crb"),
assay=DefaultAssay(sc),
assay_raw=param$assay_raw,
delayed_array=FALSE)Result files are:
Load into Cerebro browser.
# Seurat object
saveRDS(sc, file=file.path(param$path_out, "data", "sc.rds"))
# Counts (RNA)
invisible(ExportSeuratAssayData(sc,
dir=file.path(param$path_out, "data", "counts"),
assays=param$assay_raw,
slot="counts",
include_cell_metadata_cols=colnames(sc[[]]),
metadata_prefix=paste0(param$project_id, ".")))
# Metadata
openxlsx::write.xlsx(x=sc[[]] %>% tibble::rownames_to_column(var="Barcode"), file=file.path(param$path_out, "data", "cell_metadata.xlsx"), rowNames=FALSE, colNames=TRUE)
# Annotation as excel file
openxlsx::write.xlsx(x=data.frame(seurat_id=rownames(sc), ensembl_gene_id=seurat_rowname_to_ensembl[rownames(sc)]) %>%
dplyr::inner_join(annot_ensembl, by="ensembl_gene_id"),
file=file.path(param$path_out, "annotation", "gene_annotation.xlsx"),
rowNames=FALSE, colNames=TRUE)
# Data and annotation for a subset of 500 cells for Morpheus
# ERROR: This gives an error with the newest packages, might be related to this:
# https://issueexplorer.com/issue/r-lib/cpp11/244
# For now, we outcomment this section, since it isn't really needed
#cells_subset = ScSampleCells(sc, n=500, seed=1)
#openxlsx::write.xlsx(Seurat::GetAssayData(sc[, cells_subset], slot="data") %>% as.data.frame() %>% tibble::rownames_to_column(var="gene"),
# file=file.path(param$path_out, "data", paste0("subset_", 500, "_cells_normalised_data.xlsx")),
# rowNames=FALSE, colNames=TRUE)
#openxlsx::write.xlsx(Seurat::GetAssayData(sc[, cells_subset], slot="scale.data") %>% as.data.frame() %>% #tibble::rownames_to_column(var="gene"),
# file=file.path(param$path_out, "data", paste0("subset_", 500, "_cells_scaled_data.xlsx")),
# rowNames=FALSE, colNames=TRUE)
#openxlsx::write.xlsx(sc[[]][cells_subset, ] %>% as.data.frame() %>% tibble::rownames_to_column(var="cell"),
# file=file.path(param$path_out, "data",paste0("subset_", 500, "_cells_column_annotations.xlsx")),
# rowNames=FALSE, colNames=TRUE)Result files are:
The following parameters were used to run the workflow.
out = ScrnaseqParamsInfo(params=param)
knitr::kable(out, align="l") %>%
kableExtra::kable_styling(bootstrap_options=c("striped", "hover"), full_width=FALSE, position="left")| Name | Value |
|---|---|
| author | andpetzo |
| project_id | pbmc |
| path_data | name:pbmc_10x, pbmc_smartseq2; type:10x, smartseq2; path:test_datasets/10x_SmartSeq2_pbmc_GSE132044/counts/10x/, test_datasets/10x_SmartSeq2_pbmc_GSE132044/counts/smartseq2/counts_table.tsv.gz; stats:NA, NA; suffix:-1, -2 |
| assay_raw | RNA |
| downsample_cells_n | 100 |
| path_out | test_datasets/10x_SmartSeq2_pbmc_GSE132044/results/ |
| file_known_markers | test_datasets/10x_SmartSeq2_pbmc_GSE132044/known_markers.xlsx |
| mart_dataset | hsapiens_gene_ensembl |
| annot_version | 98 |
| annot_main | ensembl=ensembl_gene_id, symbol=external_gene_name, entrez=entrezgene_accession |
| mart_attributes | ensembl_gene_id, external_gene_name, entrezgene_accession, chromosome_name, start_position, end_position, percentage_gene_gc_content, gene_biotype, strand, description |
| biomart_mirror | www |
| mt | ^MT- |
| cell_filter | pbmc_10x:nFeature_RNA=c(NA, NA), percent_mt=c(NA, NA); pbmc_smartseq2_sample1:nFeature_RNA=c(NA, NA), percent_mt=c(NA, NA) |
| feature_filter | pbmc_10x:min_counts=1, min_cells=3; pbmc_smartseq2_sample1:min_counts=1, min_cells=3 |
| samples_min_cells | 10 |
| norm | RNA |
| cc_remove | FALSE |
| cc_remove_all | FALSE |
| cc_rescore_after_merge | TRUE |
| integrate_samples | method:integrate; dimensions:30; reference:; use_reciprocal_pca:FALSE |
| pc_n | 10 |
| cluster_k | 20 |
| umap_k | 30 |
| cluster_resolution_test | 0.3, 0.7 |
| cluster_resolution | 1 |
| marker_padj | 0.05 |
| marker_log2FC | 1 |
| marker_pct | 0.25 |
| deg_contrasts | condition_column:orig.ident, orig.ident, Phase; condition_group1:pbmc_10x, pbmc_10x, G1; condition_group2:pbmc_smartseq2_sample1, pbmc_smartseq2_sample1, G2M; subset_column:NA, seurat_clusters, seurat_clusters; subset_group:NA, , 1;2; downsample_cells_n:NA, 50, 30 |
| enrichr_site | Enrichr |
| enrichr_padj | 0.05 |
| enrichr_dbs | GO_Molecular_Function_2018, GO_Biological_Process_2018, GO_Cellular_Component_2018, Azimuth_Cell_Types_2021, CellMarker_Augmented_2021, Descartes_Cell_Types_and_Tissue_2021 |
| col | palevioletred |
| col_palette_samples | ggsci::pal_jama |
| col_palette_clusters | ggsci::pal_igv |
| path_to_git | . |
| debugging_mode | default_debugging |
| cores | 4 |
| file_annot | test_datasets/10x_SmartSeq2_pbmc_GSE132044/results//annotation/hsapiens_gene_ensembl.v98.annot.txt |
| file_cc_genes | test_datasets/10x_SmartSeq2_pbmc_GSE132044/results//annotation/cell_cycle_markers.xlsx |
| samples_to_drop | |
| col_samples | pbmc_10x=#374E55FF, pbmc_smartseq2_sample1=#DF8F44FF |
| latent_vars | orig.ident |
| col_clusters | 1=#5050FFFF, 2=#CE3D32FF, 3=#749B58FF, 4=#F0E685FF |
This report was created with generated using the scrnaseq GitHub repository. Software versions were collected at run time.
out = ScrnaseqSessionInfo(param$path_to_git)
knitr::kable(out, align="l") %>%
kableExtra::kable_styling(bootstrap_options=c("striped", "hover"))| Name | Value |
|---|---|
| Run on: | Wed Jan 26 12:57:52 2022 |
| ktrns/scrnaseq | d8bfa3e27ec278c2f8193b71f30a856589c811fb |
| Container | B, D, A |
| R | R version 4.0.5 (2021-03-31) |
| Platform | x86_64-conda-linux-gnu (64-bit) |
| Operating system | Debian GNU/Linux 11 (bullseye) |
| Host name | ngs10 |
| Host OS | #103-Ubuntu SMP Fri Nov 26 16:13:00 UTC 2021 (5.4.0-92-generic) |
| Packages | abind1.4-5, annotate1.68.0, AnnotationDbi1.52.0, ape5.6-1, askpass1.1, assertthat0.2.1, babelgene21.4, bibtex0.4.2.3, Biobase2.50.0, BiocFileCache1.14.0, BiocGenerics0.36.0, BiocParallel1.24.1, biomaRt2.46.3, bit4.0.4, bit644.0.5, bitops1.0-7, blob1.2.2, bslib0.3.1, cachem1.0.6, cerebroApp1.3.1, cli3.1.1, cluster2.1.2, codetools0.2-18, colorspace2.0-2, colourpicker1.1.1, cowplot1.1.1, crayon1.4.2, curl4.3.2, data.table1.14.2, DBI1.1.2, dbplyr2.1.1, DelayedArray0.16.3, deldir1.0-6, digest0.6.29, dplyr1.0.7, DT0.20, ellipsis0.3.2, enrichR3.0, evaluate0.14, fansi1.0.2, farver2.1.0, fastmap1.1.0, fitdistrplus1.1-6, fs1.5.2, future1.23.0, future.apply1.8.1, generics0.1.1, GenomeInfoDb1.26.4, GenomeInfoDbData1.2.4, GenomicRanges1.42.0, ggplot23.3.5, ggrepel0.9.1, ggridges0.5.3, ggsci2.9, globals0.14.0, glue1.6.1, goftest1.2-3, graph1.68.0, gridExtra2.3, GSEABase1.52.1, GSVA1.38.2, gtable0.3.0, highr0.9, hms1.1.1, htmltools0.5.2, htmlwidgets1.5.4, httpuv1.6.5, httr1.4.2, ica1.0-2, igraph1.2.11, IRanges2.24.1, irlba2.3.5, jquerylib0.1.4, jsonlite1.7.3, kableExtra1.3.4, KernSmooth2.23-20, knitcitations1.0.12, knitr1.37, labeling0.4.2, later1.2.0, lattice0.20-45, lazyeval0.2.2, leiden0.3.9, lifecycle1.0.1, limma3.46.0, listenv0.8.0, lmtest0.9-39, lubridate1.8.0, magrittr2.0.1, MASS7.3-55, MAST1.16.0, Matrix1.4-0, MatrixGenerics1.2.1, matrixStats0.61.0, memoise2.0.1, mgcv1.8-38, mime0.12, miniUI0.1.1.1, msigdbr7.4.1, munsell0.5.0, nlme3.1-155, openssl1.4.6, openxlsx4.2.5, parallelly1.30.0, patchwork1.1.1, pbapply1.5-0, pillar1.6.5, pkgconfig2.0.3, plotly4.10.0, plyr1.8.6, png0.1-7, polyclip1.10-0, prettyunits1.1.1, progress1.2.2, promises1.2.0.1, purrr0.3.4, qvalue2.22.0, R.methodsS31.8.1, R.oo1.24.0, R.utils2.11.0, R62.5.1, RANN2.6.1, rappdirs0.3.3, RColorBrewer1.1-2, Rcpp1.0.8, RcppAnnoy0.0.19, RCurl1.98-1.5, readr2.1.1, RefManageR1.3.5, reshape21.4.4, reticulate1.23, rjson0.2.21, rlang0.4.12, rmarkdown2.11, ROCR1.0-11, rpart4.1.16, RSpectra0.16-0, RSQLite2.2.8, rstudioapi0.13, Rtsne0.15, rvest1.0.2, S4Vectors0.28.1, sass0.4.0, scales1.1.1, scattermore0.7, sceasy0.0.6, sctransform0.3.3, sessioninfo1.2.2, Seurat4.1.0, SeuratObject4.0.4, shiny1.7.1, shinycssloaders1.0.0, shinydashboard0.7.2, shinyFiles0.9.1, shinyjs2.1.0, shinyWidgets0.6.3, SingleCellExperiment1.12.0, spatstat.core2.3-2, spatstat.data2.1-2, spatstat.geom2.3-1, spatstat.sparse2.1-0, spatstat.utils2.3-0, stringi1.7.6, stringr1.4.0, SummarizedExperiment1.20.0, survival3.2-13, svglite2.0.0, systemfonts1.0.3, tensor1.5, tibble3.1.6, tidyr1.1.4, tidyselect1.1.1, tzdb0.2.0, utf81.2.2, uwot0.1.11, vctrs0.3.8, viridis0.6.2, viridisLite0.4.0, vroom1.5.7, webshot0.5.2, withr2.4.3, xfun0.29, XML3.99-0.8, xml21.3.3, xtable1.8-4, XVector0.30.0, yaml2.2.2, zip2.2.0, zlibbioc1.36.0, zoo1.8-9 |
This Workflow was developed as part of the scrnaseq GitHub repository by Katrin Sameith and Andreas Petzold at the Dresden-concept Genome Center, TU Dresden (Dresden, Germany), in collaboration with Torsten Glomb, Maike Kosanke and Oliver Dittrich at the Research Core Unit Genomics, Hannover Medical School (Hannover, Germany). The Seurat Vignettes were initially used as templates.
# Writes knitcitations references to references.bib file.
knitcitations::write.bibtex(file=file.path(param$path_out, "references.bib"))